bitkeeper revision 1.1159.250.1 (4208d729hvKh9E4GWJWvFDThd8OXyA)
author kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>
Tue, 8 Feb 2005 15:13:45 +0000 (15:13 +0000)
committer kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>
Tue, 8 Feb 2005 15:13:45 +0000 (15:13 +0000)
More x86_64 fixes/cleanups. NB. update_va_mapping hypercalls now
accept a virtual address, *not* a "virtual page number".
Signed-off-by: keir.fraser@cl.cam.ac.uk
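
The change is mechanical at call sites. As an illustration (names taken
from the hunks below):

    /* Before: pass a virtual page number. */
    HYPERVISOR_update_va_mapping(address >> PAGE_SHIFT, entry, UVMF_INVLPG);

    /* After: pass the virtual address itself. */
    HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG);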
18 files changed:
.rootkeys
linux-2.4.29-xen-sparse/mm/memory.c
linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c
linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c
linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c
linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c
linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h
linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h
netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h
netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c
xen/arch/x86/memory.c [deleted file]
xen/arch/x86/mm.c [new file with mode: 0644]
xen/arch/x86/traps.c
xen/common/dom_mem_ops.c
xen/include/asm-x86/page.h
xen/include/asm-x86/x86_32/regs.h
xen/include/asm-x86/x86_64/regs.h
xen/include/asm-x86/x86_64/uaccess.h

index d79a03aab0417dafbbe943650e26c119ee40a37e..7a3f24fd752a862c686321a8f37963b10f2269f1 100644 (file)
--- a/.rootkeys
+++ b/.rootkeys
 3ddb79bcBit4xJXbwtX0kb1hh2uO1Q xen/arch/x86/idle0_task.c
 3ddb79bcKIkRR0kqWaJhe5VUDkMdxg xen/arch/x86/io_apic.c
 3ddb79bdqfIcjkz_h9Hvtp8Tk_19Zw xen/arch/x86/irq.c
-40ec29ffuOa1ZvmJHzFKyZn4k_RcXg xen/arch/x86/memory.c
 41d54a76qfpO0VnbL2tYs0Jgt3W3XA xen/arch/x86/microcode.c
+40ec29ffuOa1ZvmJHzFKyZn4k_RcXg xen/arch/x86/mm.c
 3ddb79bdS4UeWWXDH-FaBKqcpMFcnw xen/arch/x86/mpparse.c
 41aaf566Z4sTDgJ77eEg0TzzQ1ka6Q xen/arch/x86/mtrr/amd.c
 41aaf566TOpOBXT00wwQGUh20f1rlA xen/arch/x86/mtrr/centaur.c
index 1ae2a429882e17c57605082f68998cd430e045c4..7d81c86589c5550448163fc8d5d1f18a782725c3 100644 (file)
--- a/linux-2.4.29-xen-sparse/mm/memory.c
+++ b/linux-2.4.29-xen-sparse/mm/memory.c
@@ -915,7 +915,7 @@ static inline void establish_pte(struct vm_area_struct * vma, unsigned long addr
 #ifdef CONFIG_XEN
        if ( likely(vma->vm_mm == current->mm) ) {
                XEN_flush_page_update_queue();
-               HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, entry, UVMF_INVLPG);
+               HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG);
        } else {
                set_pte(page_table, entry);
                flush_tlb_page(vma, address);
@@ -1191,7 +1191,7 @@ static int do_swap_page(struct mm_struct * mm,
 #ifdef CONFIG_XEN
        if ( likely(vma->vm_mm == current->mm) ) {
                XEN_flush_page_update_queue();
-               HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, pte, 0);
+               HYPERVISOR_update_va_mapping(address, pte, 0);
        } else {
                set_pte(page_table, pte);
                XEN_flush_page_update_queue();
@@ -1247,7 +1247,7 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma,
 #ifdef CONFIG_XEN
        if ( likely(vma->vm_mm == current->mm) ) {
                XEN_flush_page_update_queue();
-               HYPERVISOR_update_va_mapping(addr>>PAGE_SHIFT, entry, 0);
+               HYPERVISOR_update_va_mapping(addr, entry, 0);
        } else {
                set_pte(page_table, entry);
                XEN_flush_page_update_queue();
@@ -1333,7 +1333,7 @@ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
 #ifdef CONFIG_XEN
                if ( likely(vma->vm_mm == current->mm) ) {
                        XEN_flush_page_update_queue();
-                       HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, entry, 0);
+                       HYPERVISOR_update_va_mapping(address, entry, 0);
                } else {
                        set_pte(page_table, entry);
                        XEN_flush_page_update_queue();
index 6167f6061bf08fa3eb7776ace1f0e10d95feaaee..4f74a1c5142d95979b1fe095647cf75b419a0740 100644 (file)
--- a/linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c
+++ b/linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c
@@ -95,7 +95,7 @@ static void fast_flush_area(int idx, int nr_pages)
     for ( i = 0; i < nr_pages; i++ )
     {
         mcl[i].op = __HYPERVISOR_update_va_mapping;
-        mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT;
+        mcl[i].args[0] = MMAP_VADDR(idx, i);
         mcl[i].args[1] = 0;
         mcl[i].args[2] = 0;
     }
@@ -343,14 +343,14 @@ static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
 
 #ifdef CONFIG_XEN_BLKDEV_TAP_BE
     if ( HYPERVISOR_update_va_mapping_otherdomain(
-        MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
+        MMAP_VADDR(pending_idx, 0),
         (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
         0, (blkif->is_blktap ? ID_TO_DOM(req->id) : blkif->domid) ) )
         
         goto out;
 #else
     if ( HYPERVISOR_update_va_mapping_otherdomain(
-        MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
+        MMAP_VADDR(pending_idx, 0),
         (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
         0, blkif->domid) ) 
         
@@ -436,7 +436,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
     for ( i = 0; i < nr_psegs; i++ )
     {
         mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
-        mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
+        mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
         mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot;
         mcl[i].args[2] = 0;
 #ifdef CONFIG_XEN_BLKDEV_TAP_BE
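
A minimal sketch of the batching pattern seen above, assuming the
multicall_entry_t layout and HYPERVISOR_multicall() stub used in this
tree (flush_va_range and the fixed 16-entry batch are hypothetical;
flushing once via the final entry mirrors fast_flush_area()):

    static void flush_va_range(unsigned long va, int nr_pages /* <= 16 */)
    {
        multicall_entry_t mcl[16];
        int i;

        for ( i = 0; i < nr_pages; i++ )
        {
            mcl[i].op      = __HYPERVISOR_update_va_mapping;
            mcl[i].args[0] = va + ((unsigned long)i << PAGE_SHIFT); /* a VA now */
            mcl[i].args[1] = 0;   /* new PTE value: zero unmaps the page */
            mcl[i].args[2] = 0;   /* per-call flags                      */
        }
        mcl[nr_pages-1].args[2] = UVMF_FLUSH_TLB;  /* one flush, at the end */

        if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
            BUG();
    }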
index b75b6169a759cf64ea1d69e25dee163c7c2aa09e..5f5b99cce328f5dde5c94ac0724e069048e2a6c2 100644 (file)
--- a/linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c
+++ b/linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c
@@ -234,7 +234,7 @@ static void net_rx_action(unsigned long unused)
         mmu[2].val  = MMUEXT_REASSIGN_PAGE;
 
         mcl[0].op = __HYPERVISOR_update_va_mapping;
-        mcl[0].args[0] = vdata >> PAGE_SHIFT;
+        mcl[0].args[0] = vdata;
         mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL;
         mcl[0].args[2] = 0;
         mcl[1].op = __HYPERVISOR_mmu_update;
@@ -409,7 +409,7 @@ static void net_tx_action(unsigned long unused)
     {
         pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
         mcl[0].op = __HYPERVISOR_update_va_mapping;
-        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
+        mcl[0].args[0] = MMAP_VADDR(pending_idx);
         mcl[0].args[1] = 0;
         mcl[0].args[2] = 0;
         mcl++;     
@@ -546,7 +546,7 @@ static void net_tx_action(unsigned long unused)
         skb_reserve(skb, 16);
 
         mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain;
-        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
+        mcl[0].args[0] = MMAP_VADDR(pending_idx);
         mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL;
         mcl[0].args[2] = 0;
         mcl[0].args[3] = netif->domid;
index ee53c8b1a6b486ef69211b1fdd5937a3cf89d0ab..7e8c434812d10633a1599a516efffcb7e674c883 100644 (file)
--- a/linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c
+++ b/linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c
@@ -392,7 +392,7 @@ static void network_alloc_rx_buffers(struct net_device *dev)
            = INVALID_P2M_ENTRY;
 
         rx_mcl[i].op = __HYPERVISOR_update_va_mapping;
-        rx_mcl[i].args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
+        rx_mcl[i].args[0] = (unsigned long)skb->head;
         rx_mcl[i].args[1] = 0;
         rx_mcl[i].args[2] = 0;
     }
@@ -593,7 +593,7 @@ static int netif_poll(struct net_device *dev, int *pbudget)
         mmu->val  = __pa(skb->head) >> PAGE_SHIFT;
         mmu++;
         mcl->op = __HYPERVISOR_update_va_mapping;
-        mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
+        mcl->args[0] = (unsigned long)skb->head;
         mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
         mcl->args[2] = 0;
         mcl++;
index 35fcdd84e4ca452b7c6ad83ec538b043c4a060d5..e8ad4e422d0bf04a61edbd7fae1c3fadc4d60222 100644 (file)
--- a/linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c
+++ b/linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c
@@ -191,7 +191,7 @@ static void fast_flush_area(int idx, int nr_pages)
     for ( i = 0; i < nr_pages; i++ )
     {
         mcl[i].op = __HYPERVISOR_update_va_mapping;
-        mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT;
+        mcl[i].args[0] = MMAP_VADDR(idx, i);
         mcl[i].args[1] = 0;
         mcl[i].args[2] = 0;
     }
@@ -630,7 +630,7 @@ static void dispatch_usb_io(usbif_priv_t *up, usbif_request_t *req)
           i++, offset += PAGE_SIZE )
     {
        mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
-       mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
+       mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
         mcl[i].args[1] = ((buffer_mach & PAGE_MASK) + offset) | remap_prot;
         mcl[i].args[2] = 0;
         mcl[i].args[3] = up->domid;
@@ -646,7 +646,7 @@ static void dispatch_usb_io(usbif_priv_t *up, usbif_request_t *req)
     {
         /* Map in ISO schedule, if necessary. */
         mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
-        mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
+        mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
         mcl[i].args[1] = (req->iso_schedule & PAGE_MASK) | remap_prot;
         mcl[i].args[2] = 0;
         mcl[i].args[3] = up->domid;
index 3c0ae34e9796093b298b78b1c31d60fbd2078d4c..69e0fbf0ba9a2d4ee9c599377f662687aadfc475 100644 (file)
--- a/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h
+++ b/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h
@@ -426,7 +426,7 @@ extern pte_t *lookup_address(unsigned long address);
                if (__dirty) {                                            \
                        if ( likely((__vma)->vm_mm == current->mm) ) {    \
                            xen_flush_page_update_queue();                \
-                           HYPERVISOR_update_va_mapping((__address)>>PAGE_SHIFT, (__entry), UVMF_INVLPG); \
+                           HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG); \
                        } else {                                          \
                             xen_l1_entry_update((__ptep), (__entry).pte_low); \
                            flush_tlb_page((__vma), (__address));         \
@@ -445,7 +445,7 @@ do {                                                                        \
 do {                                                                   \
        if (likely((__vma)->vm_mm == current->mm)) {                    \
                xen_flush_page_update_queue();                          \
-               HYPERVISOR_update_va_mapping((__address)>>PAGE_SHIFT,   \
+               HYPERVISOR_update_va_mapping((__address),               \
                                             __entry, 0);               \
        } else {                                                        \
                xen_l1_entry_update((__ptep), (__entry).pte_low);       \
index e54caa1b99485f73368f8f67a4ebc9a62c7dce8c..f6fcff4e08a64221439f66014e1cbc2f8e42e40c 100644 (file)
--- a/linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h
+++ b/linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h
@@ -438,7 +438,7 @@ HYPERVISOR_multicall(
 
 static inline int
 HYPERVISOR_update_va_mapping(
-    unsigned long page_nr, pte_t new_val, unsigned long flags)
+    unsigned long va, pte_t new_val, unsigned long flags)
 {
     int ret;
     unsigned long ign1, ign2, ign3;
@@ -447,13 +447,13 @@ HYPERVISOR_update_va_mapping(
         TRAP_INSTR
         : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3)
        : "0" (__HYPERVISOR_update_va_mapping), 
-          "1" (page_nr), "2" ((new_val).pte_low), "3" (flags)
+          "1" (va), "2" ((new_val).pte_low), "3" (flags)
        : "memory" );
 
     if ( unlikely(ret < 0) )
     {
         printk(KERN_ALERT "Failed update VA mapping: %08lx, %08lx, %08lx\n",
-               page_nr, (new_val).pte_low, flags);
+               va, (new_val).pte_low, flags);
         BUG();
     }
 
@@ -540,7 +540,7 @@ HYPERVISOR_grant_table_op(
 
 static inline int
 HYPERVISOR_update_va_mapping_otherdomain(
-    unsigned long page_nr, pte_t new_val, unsigned long flags, domid_t domid)
+    unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
 {
     int ret;
     unsigned long ign1, ign2, ign3, ign4;
@@ -549,7 +549,7 @@ HYPERVISOR_update_va_mapping_otherdomain(
         TRAP_INSTR
         : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4)
        : "0" (__HYPERVISOR_update_va_mapping_otherdomain),
-          "1" (page_nr), "2" ((new_val).pte_low), "3" (flags), "4" (domid) :
+          "1" (va), "2" ((new_val).pte_low), "3" (flags), "4" (domid) :
         "memory" );
     
     return ret;
index 035495d4be4a406c0d9b58a71fa91bd7d916d1b0..eb91c2a980419c128d386213355f9fe491714891 100644 (file)
--- a/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h
@@ -398,7 +398,7 @@ HYPERVISOR_multicall(void *call_list, int nr_calls)
 }
 
 static inline int
-HYPERVISOR_update_va_mapping(unsigned long page_nr, unsigned long new_val,
+HYPERVISOR_update_va_mapping(unsigned long va, unsigned long new_val,
     unsigned long flags)
 {
     int ret;
@@ -408,12 +408,12 @@ HYPERVISOR_update_va_mapping(unsigned long page_nr, unsigned long new_val,
         TRAP_INSTR
         : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3)
        : "0" (__HYPERVISOR_update_va_mapping), 
-          "1" (page_nr), "2" (new_val), "3" (flags)
+          "1" (va), "2" (new_val), "3" (flags)
        : "memory" );
 
     if (__predict_false(ret < 0))
         panic("Failed update VA mapping: %08lx, %08lx, %08lx",
-              page_nr, new_val, flags);
+              va, new_val, flags);
 
     return ret;
 }
@@ -494,7 +494,7 @@ HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
 }
 
 static inline int
-HYPERVISOR_update_va_mapping_otherdomain(unsigned long page_nr,
+HYPERVISOR_update_va_mapping_otherdomain(unsigned long va,
     unsigned long new_val, unsigned long flags, domid_t domid)
 {
     int ret;
@@ -504,7 +504,7 @@ HYPERVISOR_update_va_mapping_otherdomain(unsigned long page_nr,
         TRAP_INSTR
         : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4)
        : "0" (__HYPERVISOR_update_va_mapping_otherdomain),
-          "1" (page_nr), "2" (new_val), "3" (flags), "4" (domid) :
+          "1" (va), "2" (new_val), "3" (flags), "4" (domid) :
         "memory" );
     
     return ret;
index 9d8618923c18fd9843e3fb9efcc88b0749ea803a..e1063b1775c0f8e53c8a173a2743facbbd493877 100644 (file)
--- a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c
@@ -580,7 +580,7 @@ xennet_rx_push_buffer(struct xennet_softc *sc, int id)
                INVALID_P2M_ENTRY;
 
        rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
-       rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va >> PAGE_SHIFT;
+       rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va;
        rx_mcl[nr_pfns].args[1] = 0;
        rx_mcl[nr_pfns].args[2] = 0;
 
@@ -679,7 +679,7 @@ xen_network_handler(void *arg)
                mmu->val  = (pa - XPMAP_OFFSET) >> PAGE_SHIFT;
                mmu++;
                mcl->op = __HYPERVISOR_update_va_mapping;
-               mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va >> PAGE_SHIFT;
+               mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va;
                mcl->args[1] = (rx->addr & PG_FRAME) | PG_V|PG_KW;
                mcl->args[2] = UVMF_FLUSH_TLB; // 0;
                mcl++;
@@ -872,7 +872,7 @@ network_alloc_rx_buffers(struct xennet_softc *sc)
                        INVALID_P2M_ENTRY;
 
                rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
-               rx_mcl[nr_pfns].args[0] = va >> PAGE_SHIFT;
+               rx_mcl[nr_pfns].args[0] = va;
                rx_mcl[nr_pfns].args[1] = 0;
                rx_mcl[nr_pfns].args[2] = 0;
 
diff --git a/xen/arch/x86/memory.c b/xen/arch/x86/memory.c
deleted file mode 100644 (file)
index 36a1463..0000000
+++ /dev/null
@@ -1,2588 +0,0 @@
-/* -*-  Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
-/******************************************************************************
- * arch/x86/memory.c
- * 
- * Copyright (c) 2002-2004 K A Fraser
- * Copyright (c) 2004 Christian Limpach
- * 
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-/*
- * A description of the x86 page table API:
- * 
- * Domains trap to do_mmu_update with a list of update requests.
- * This is a list of (ptr, val) pairs, where the requested operation
- * is *ptr = val.
- * 
- * Reference counting of pages:
- * ----------------------------
- * Each page has two refcounts: tot_count and type_count.
- * 
- * TOT_COUNT is the obvious reference count. It counts all uses of a
- * physical page frame by a domain, including uses as a page directory,
- * a page table, or simple mappings via a PTE. This count prevents a
- * domain from releasing a frame back to the free pool when it still holds
- * a reference to it.
- * 
- * TYPE_COUNT is more subtle. A frame can be put to one of three
- * mutually-exclusive uses: it might be used as a page directory, or a
- * page table, or it may be mapped writable by the domain [of course, a
- * frame may not be used in any of these three ways!].
- * So, type_count is a count of the number of times a frame is being 
- * referred to in its current incarnation. Therefore, a page can only
- * change its type when its type count is zero.
- * 
- * Pinning the page type:
- * ----------------------
- * The type of a page can be pinned/unpinned with the commands
- * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
- * pinning is not reference counted, so it can't be nested).
- * This is useful to prevent a page's type count falling to zero, at which
- * point safety checks would need to be carried out next time the count
- * is increased again.
- * 
- * A further note on writable page mappings:
- * -----------------------------------------
- * For simplicity, the count of writable mappings for a page may not
- * correspond to reality. The 'writable count' is incremented for every
- * PTE which maps the page with the _PAGE_RW flag set. However, for
- * write access to be possible the page directory entry must also have
- * its _PAGE_RW bit set. We do not check this as it complicates the 
- * reference counting considerably [consider the case of multiple
- * directory entries referencing a single page table, some with the RW
- * bit set, others not -- it starts getting a bit messy].
- * In normal use, this simplification shouldn't be a problem.
- * However, the logic can be added if required.
- * 
- * One more note on read-only page mappings:
- * -----------------------------------------
- * We want domains to be able to map pages for read-only access. The
- * main reason is that page tables and directories should be readable
- * by a domain, but it would not be safe for them to be writable.
- * However, domains have free access to rings 1 & 2 of the Intel
- * privilege model. In terms of page protection, these are considered
- * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
- * read-only restrictions are respected in supervisor mode -- if the 
- * bit is clear then any mapped page is writable.
- * 
- * We get round this by always setting the WP bit and disallowing 
- * updates to it. This is very unlikely to cause a problem for guest
- * OS's, which will generally use the WP bit to simplify copy-on-write
- * implementation (in that case, OS wants a fault when it writes to
- * an application-supplied buffer).
- */
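
Schematically, the two counts described above behave as below. This is a
sketch only: the real code packs each count together with flag bits into
the 32-bit count_info and type_info words of struct pfn_info.

    struct page_counts {            /* illustrative, not the real layout */
        unsigned int tot_count;     /* all references to the frame       */
        unsigned int type_count;    /* references under the current type */
    };
    /* Invariants: a frame may change type only when type_count == 0,    */
    /* and may return to the free pool only when tot_count drops to 0.   */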
-
-#include <xen/config.h>
-#include <xen/init.h>
-#include <xen/kernel.h>
-#include <xen/lib.h>
-#include <xen/mm.h>
-#include <xen/sched.h>
-#include <xen/errno.h>
-#include <xen/perfc.h>
-#include <xen/irq.h>
-#include <xen/softirq.h>
-#include <asm/shadow.h>
-#include <asm/page.h>
-#include <asm/flushtlb.h>
-#include <asm/io.h>
-#include <asm/uaccess.h>
-#include <asm/domain_page.h>
-#include <asm/ldt.h>
-
-#ifdef VERBOSE
-#define MEM_LOG(_f, _a...)                           \
-  printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
-         current->domain->id , __LINE__ , ## _a )
-#else
-#define MEM_LOG(_f, _a...) ((void)0)
-#endif
-
-static int alloc_l2_table(struct pfn_info *page);
-static int alloc_l1_table(struct pfn_info *page);
-static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
-static int get_page_and_type_from_pagenr(unsigned long page_nr, 
-                                         u32 type,
-                                         struct domain *d);
-
-static void free_l2_table(struct pfn_info *page);
-static void free_l1_table(struct pfn_info *page);
-
-static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
-static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
-
-/* Used to defer flushing of memory structures. */
-static struct {
-#define DOP_FLUSH_TLB   (1<<0) /* Flush the TLB.                 */
-#define DOP_RELOAD_LDT  (1<<1) /* Reload the LDT shadow mapping. */
-    unsigned long  deferred_ops;
-    /* If non-NULL, specifies a foreign subject domain for some operations. */
-    struct domain *foreign;
-} __cacheline_aligned percpu_info[NR_CPUS];
-
-/*
- * Returns the current foreign domain; defaults to the currently-executing
- * domain if a foreign override hasn't been specified.
- */
-#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
-
-/* Private domain structs for DOMID_XEN and DOMID_IO. */
-static struct domain *dom_xen, *dom_io;
-
-/* Frame table and its size in pages. */
-struct pfn_info *frame_table;
-unsigned long frame_table_size;
-unsigned long max_page;
-
-void __init init_frametable(void)
-{
-    unsigned long i, p;
-
-    frame_table      = (struct pfn_info *)FRAMETABLE_VIRT_START;
-    frame_table_size = max_page * sizeof(struct pfn_info);
-    frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
-
-    for ( i = 0; i < frame_table_size; i += (4UL << 20) )
-    {
-        p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
-        if ( p == 0 )
-            panic("Not enough memory for frame table\n");
-        map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p, 
-                  4UL << 20, PAGE_HYPERVISOR);
-    }
-
-    memset(frame_table, 0, frame_table_size);
-}
-
-void arch_init_memory(void)
-{
-    extern void subarch_init_memory(struct domain *);
-
-    memset(percpu_info, 0, sizeof(percpu_info));
-
-    /*
-     * Initialise our DOMID_XEN domain.
-     * Any Xen-heap pages that we will allow to be mapped will have
-     * their domain field set to dom_xen.
-     */
-    dom_xen = alloc_domain_struct();
-    atomic_set(&dom_xen->refcnt, 1);
-    dom_xen->id = DOMID_XEN;
-
-    /*
-     * Initialise our DOMID_IO domain.
-     * This domain owns no pages but is considered a special case when
-     * mapping I/O pages, as the mappings occur at the priv of the caller.
-     */
-    dom_io = alloc_domain_struct();
-    atomic_set(&dom_io->refcnt, 1);
-    dom_io->id = DOMID_IO;
-
-    subarch_init_memory(dom_xen);
-}
-
-void write_ptbase(struct exec_domain *ed)
-{
-    struct domain *d = ed->domain;
-    unsigned long pa;
-
-#ifdef CONFIG_VMX
-    if ( unlikely(d->arch.shadow_mode) )
-        pa = ((d->arch.shadow_mode == SHM_full_32) ?
-              pagetable_val(ed->arch.monitor_table) :
-              pagetable_val(ed->arch.shadow_table));
-    else
-        pa = pagetable_val(ed->arch.pagetable);
-#else
-    if ( unlikely(d->arch.shadow_mode) )
-        pa = pagetable_val(ed->arch.shadow_table);    
-    else
-        pa = pagetable_val(ed->arch.pagetable);
-#endif
-
-    write_cr3(pa);
-}
-
-static void __invalidate_shadow_ldt(struct exec_domain *d)
-{
-    int i;
-    unsigned long pfn;
-    struct pfn_info *page;
-    
-    d->arch.shadow_ldt_mapcnt = 0;
-
-    for ( i = 16; i < 32; i++ )
-    {
-        pfn = l1_pgentry_to_pagenr(d->arch.perdomain_ptes[i]);
-        if ( pfn == 0 ) continue;
-        d->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
-        page = &frame_table[pfn];
-        ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
-        ASSERT_PAGE_IS_DOMAIN(page, d->domain);
-        put_page_and_type(page);
-    }
-
-    /* Dispose of the (now possibly invalid) mappings from the TLB.  */
-    percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
-}
-
-
-static inline void invalidate_shadow_ldt(struct exec_domain *d)
-{
-    if ( d->arch.shadow_ldt_mapcnt != 0 )
-        __invalidate_shadow_ldt(d);
-}
-
-
-static int alloc_segdesc_page(struct pfn_info *page)
-{
-    struct desc_struct *descs;
-    int i;
-
-    descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
-
-    for ( i = 0; i < 512; i++ )
-        if ( unlikely(!check_descriptor(&descs[i])) )
-            goto fail;
-
-    unmap_domain_mem(descs);
-    return 1;
-
- fail:
-    unmap_domain_mem(descs);
-    return 0;
-}
-
-
-/* Map shadow page at offset @off. */
-int map_ldt_shadow_page(unsigned int off)
-{
-    struct exec_domain *ed = current;
-    struct domain *d = ed->domain;
-    unsigned long l1e;
-
-    if ( unlikely(in_irq()) )
-        BUG();
-
-    __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->arch.ldt_base >> 
-                                                       PAGE_SHIFT) + off]);
-
-    if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
-         unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], 
-                                     d, PGT_ldt_page)) )
-        return 0;
-
-    ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
-    ed->arch.shadow_ldt_mapcnt++;
-
-    return 1;
-}
-
-
-static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
-{
-    struct pfn_info *page = &frame_table[page_nr];
-
-    if ( unlikely(!pfn_is_ram(page_nr)) )
-    {
-        MEM_LOG("Pfn %08lx is not RAM", page_nr);
-        return 0;
-    }
-
-    if ( unlikely(!get_page(page, d)) )
-    {
-        MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
-        return 0;
-    }
-
-    return 1;
-}
-
-
-static int get_page_and_type_from_pagenr(unsigned long page_nr, 
-                                         u32 type,
-                                         struct domain *d)
-{
-    struct pfn_info *page = &frame_table[page_nr];
-
-    if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
-        return 0;
-
-    if ( unlikely(!get_page_type(page, type)) )
-    {
-#ifdef VERBOSE
-        if ( (type & PGT_type_mask) != PGT_l1_page_table )
-            MEM_LOG("Bad page type for pfn %08lx (%08x)", 
-                    page_nr, page->u.inuse.type_info);
-#endif
-        put_page(page);
-        return 0;
-    }
-
-    return 1;
-}
-
-
-/*
- * We allow L2 tables to map each other (a.k.a. linear page tables). This
- * needs some special care with reference counts and access permissions:
- *  1. The mapping entry must be read-only, or the guest may get write access
- *     to its own PTEs.
- *  2. We must only bump the reference counts for an *already validated*
- *     L2 table, or we can end up in a deadlock in get_page_type() by waiting
- *     on a validation that is required to complete that validation.
- *  3. We only need to increment the reference counts for the mapped page
- *     frame if it is mapped by a different L2 table. This is sufficient and
- *     also necessary to allow validation of an L2 table mapping itself.
- */
-static int 
-get_linear_pagetable(
-    l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
-{
-    u32 x, y;
-    struct pfn_info *page;
-
-    if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
-    {
-        MEM_LOG("Attempt to create linear p.t. with write perms");
-        return 0;
-    }
-
-    if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
-    {
-        /* Make sure the mapped frame belongs to the correct domain. */
-        if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
-            return 0;
-
-        /*
-         * Make sure that the mapped frame is an already-validated L2 table. 
-         * If so, atomically increment the count (checking for overflow).
-         */
-        page = &frame_table[l2_pgentry_to_pagenr(l2e)];
-        y = page->u.inuse.type_info;
-        do {
-            x = y;
-            if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
-                 unlikely((x & (PGT_type_mask|PGT_validated)) != 
-                          (PGT_l2_page_table|PGT_validated)) )
-            {
-                put_page(page);
-                return 0;
-            }
-        }
-        while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
-    }
-
-    return 1;
-}
-
-
-static int
-get_page_from_l1e(
-    l1_pgentry_t l1e, struct domain *d)
-{
-    unsigned long l1v = l1_pgentry_val(l1e);
-    unsigned long pfn = l1_pgentry_to_pagenr(l1e);
-    struct pfn_info *page = &frame_table[pfn];
-    extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
-
-    if ( !(l1v & _PAGE_PRESENT) )
-        return 1;
-
-    if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
-    {
-        MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
-        return 0;
-    }
-
-    if ( unlikely(!pfn_is_ram(pfn)) )
-    {
-        /* Revert to caller privileges if FD == DOMID_IO. */
-        if ( d == dom_io )
-            d = current->domain;
-
-        if ( IS_PRIV(d) )
-            return 1;
-
-        if ( IS_CAPABLE_PHYSDEV(d) )
-            return domain_iomem_in_pfn(d, pfn);
-
-        MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
-        return 0;
-    }
-
-    return ((l1v & _PAGE_RW) ?
-            get_page_and_type(page, d, PGT_writable_page) :
-            get_page(page, d));
-}
-
-
-/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
-static int 
-get_page_from_l2e(
-    l2_pgentry_t l2e, unsigned long pfn,
-    struct domain *d, unsigned long va_idx)
-{
-    int rc;
-
-    if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
-        return 1;
-
-    if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
-    {
-        MEM_LOG("Bad L2 page type settings %04lx",
-                l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
-        return 0;
-    }
-
-    rc = get_page_and_type_from_pagenr(
-        l2_pgentry_to_pagenr(l2e), 
-        PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
-
-    if ( unlikely(!rc) )
-        return get_linear_pagetable(l2e, pfn, d);
-
-    return 1;
-}
-
-
-static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
-{
-    unsigned long    l1v  = l1_pgentry_val(l1e);
-    unsigned long    pfn  = l1_pgentry_to_pagenr(l1e);
-    struct pfn_info *page = &frame_table[pfn];
-    struct domain   *e;
-
-    if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
-        return;
-
-    e = page_get_owner(page);
-    if ( unlikely(e != d) )
-    {
-        /*
-         * Unmap a foreign page that may have been mapped via a grant table.
-         * Note that this can fail for a privileged domain that can map foreign
-         * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
-         * counted via a grant entry and some counted directly in the page
-         * structure's reference count. Note that reference counts won't get
-         * dangerously confused as long as we always try to decrement the
-         * grant entry first. We may end up with a mismatch between which
-         * mappings and which unmappings are counted via the grant entry, but
-         * really it doesn't matter as privileged domains have carte blanche.
-         */
-        if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
-            return;
-        /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
-    }
-
-    if ( l1v & _PAGE_RW )
-    {
-        put_page_and_type(page);
-    }
-    else
-    {
-        /* We expect this is rare so we blow the entire shadow LDT. */
-        if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == 
-                       PGT_ldt_page)) &&
-             unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
-            invalidate_shadow_ldt(e->exec_domain[0]);
-        put_page(page);
-    }
-}
-
-
-/*
- * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
- * Note also that this automatically deals correctly with linear p.t.'s.
- */
-static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
-{
-    if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 
-         ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
-        put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
-}
-
-
-static int alloc_l2_table(struct pfn_info *page)
-{
-    struct domain *d = page_get_owner(page);
-    unsigned long  page_nr = page_to_pfn(page);
-    l2_pgentry_t  *pl2e;
-    int            i;
-   
-    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
-
-    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
-        if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
-            goto fail;
-
-#if defined(__i386__)
-    /* Now we add our private high mappings. */
-    memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
-           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
-           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-    pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
-        mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
-    pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
-        mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) | 
-                      __PAGE_HYPERVISOR);
-#endif
-
-    unmap_domain_mem(pl2e);
-    return 1;
-
- fail:
-    while ( i-- > 0 )
-        put_page_from_l2e(pl2e[i], page_nr);
-
-    unmap_domain_mem(pl2e);
-    return 0;
-}
-
-
-static int alloc_l1_table(struct pfn_info *page)
-{
-    struct domain *d = page_get_owner(page);
-    unsigned long  page_nr = page_to_pfn(page);
-    l1_pgentry_t  *pl1e;
-    int            i;
-
-    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
-
-    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
-        if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
-            goto fail;
-
-    unmap_domain_mem(pl1e);
-    return 1;
-
- fail:
-    while ( i-- > 0 )
-        put_page_from_l1e(pl1e[i], d);
-
-    unmap_domain_mem(pl1e);
-    return 0;
-}
-
-
-static void free_l2_table(struct pfn_info *page)
-{
-    unsigned long page_nr = page - frame_table;
-    l2_pgentry_t *pl2e;
-    int i;
-
-    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
-
-    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
-        put_page_from_l2e(pl2e[i], page_nr);
-
-    unmap_domain_mem(pl2e);
-}
-
-
-static void free_l1_table(struct pfn_info *page)
-{
-    struct domain *d = page_get_owner(page);
-    unsigned long page_nr = page - frame_table;
-    l1_pgentry_t *pl1e;
-    int i;
-
-    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
-
-    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
-        put_page_from_l1e(pl1e[i], d);
-
-    unmap_domain_mem(pl1e);
-}
-
-
-static inline int update_l2e(l2_pgentry_t *pl2e, 
-                             l2_pgentry_t  ol2e, 
-                             l2_pgentry_t  nl2e)
-{
-    unsigned long o = cmpxchg((unsigned long *)pl2e, 
-                              l2_pgentry_val(ol2e), 
-                              l2_pgentry_val(nl2e));
-    if ( o != l2_pgentry_val(ol2e) )
-        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
-                l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
-    return (o == l2_pgentry_val(ol2e));
-}
-
-
-/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
-static int mod_l2_entry(l2_pgentry_t *pl2e, 
-                        l2_pgentry_t nl2e, 
-                        unsigned long pfn)
-{
-    l2_pgentry_t ol2e;
-    unsigned long _ol2e;
-
-    if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
-                  DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
-    {
-        MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
-        return 0;
-    }
-
-    if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
-        return 0;
-    ol2e = mk_l2_pgentry(_ol2e);
-
-    if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
-    {
-        /* Differ in mapping (bits 12-31) or presence (bit 0)? */
-        if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
-            return update_l2e(pl2e, ol2e, nl2e);
-
-        if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
-                                        ((unsigned long)pl2e & 
-                                         ~PAGE_MASK) >> 2)) )
-            return 0;
-
-        if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
-        {
-            put_page_from_l2e(nl2e, pfn);
-            return 0;
-        }
-        
-        put_page_from_l2e(ol2e, pfn);
-        return 1;
-    }
-
-    if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
-        return 0;
-
-    put_page_from_l2e(ol2e, pfn);
-    return 1;
-}
-
-
-static inline int update_l1e(l1_pgentry_t *pl1e, 
-                             l1_pgentry_t  ol1e, 
-                             l1_pgentry_t  nl1e)
-{
-    unsigned long o = l1_pgentry_val(ol1e);
-    unsigned long n = l1_pgentry_val(nl1e);
-
-    if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
-         unlikely(o != l1_pgentry_val(ol1e)) )
-    {
-        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
-                l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
-        return 0;
-    }
-
-    return 1;
-}
-
-
-/* Update the L1 entry at pl1e to new value nl1e. */
-static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
-{
-    l1_pgentry_t ol1e;
-    unsigned long _ol1e;
-    struct domain *d = current->domain;
-
-    if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
-    {
-        MEM_LOG("Bad get_user\n");
-        return 0;
-    }
-    
-    ol1e = mk_l1_pgentry(_ol1e);
-
-    if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
-    {
-        /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
-        if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
-            return update_l1e(pl1e, ol1e, nl1e);
-
-        if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
-            return 0;
-        
-        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
-        {
-            put_page_from_l1e(nl1e, d);
-            return 0;
-        }
-        
-        put_page_from_l1e(ol1e, d);
-        return 1;
-    }
-
-    if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
-        return 0;
-    
-    put_page_from_l1e(ol1e, d);
-    return 1;
-}
-
-
-int alloc_page_type(struct pfn_info *page, unsigned int type)
-{
-    switch ( type )
-    {
-    case PGT_l1_page_table:
-        return alloc_l1_table(page);
-    case PGT_l2_page_table:
-        return alloc_l2_table(page);
-    case PGT_gdt_page:
-    case PGT_ldt_page:
-        return alloc_segdesc_page(page);
-    default:
-        printk("Bad type in alloc_page_type %x t=%x c=%x\n", 
-               type, page->u.inuse.type_info,
-               page->count_info);
-        BUG();
-    }
-
-    return 0;
-}
-
-
-void free_page_type(struct pfn_info *page, unsigned int type)
-{
-    struct domain *d = page_get_owner(page);
-
-    switch ( type )
-    {
-    case PGT_l1_page_table:
-        free_l1_table(page);
-        break;
-
-    case PGT_l2_page_table:
-        free_l2_table(page);
-        break;
-
-    default:
-        BUG();
-    }
-
-    if ( unlikely(d->arch.shadow_mode) && 
-         (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) )
-    {
-        unshadow_table(page_to_pfn(page), type);
-        put_shadow_status(d);
-    }
-}
-
-
-void put_page_type(struct pfn_info *page)
-{
-    u32 nx, x, y = page->u.inuse.type_info;
-
- again:
-    do {
-        x  = y;
-        nx = x - 1;
-
-        ASSERT((x & PGT_count_mask) != 0);
-
-        /*
-         * The page should always be validated while a reference is held. The 
-         * exception is during domain destruction, when we forcibly invalidate 
-         * page-table pages if we detect a referential loop.
-         * See domain.c:relinquish_list().
-         */
-        ASSERT((x & PGT_validated) || 
-               test_bit(DF_DYING, &page_get_owner(page)->d_flags));
-
-        if ( unlikely((nx & PGT_count_mask) == 0) )
-        {
-            /* Record TLB information for flush later. Races are harmless. */
-            page->tlbflush_timestamp = tlbflush_current_time();
-            
-            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
-                 likely(nx & PGT_validated) )
-            {
-                /*
-                 * Page-table pages must be unvalidated when count is zero. The
-                 * 'free' is safe because the refcnt is non-zero and validated
-                 * bit is clear => other ops will spin or fail.
-                 */
-                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
-                                           x & ~PGT_validated)) != x) )
-                    goto again;
-                /* We cleared the 'valid bit' so we do the clear up. */
-                free_page_type(page, x & PGT_type_mask);
-                /* Carry on, but with the 'valid bit' now clear. */
-                x  &= ~PGT_validated;
-                nx &= ~PGT_validated;
-            }
-        }
-        else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 
-                           (PGT_pinned | 1)) )
-        {
-            /* Page is now only pinned. Make the back pointer mutable again. */
-            nx |= PGT_va_mutable;
-        }
-    }
-    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
-}
-
-
-int get_page_type(struct pfn_info *page, u32 type)
-{
-    u32 nx, x, y = page->u.inuse.type_info;
-
- again:
-    do {
-        x  = y;
-        nx = x + 1;
-        if ( unlikely((nx & PGT_count_mask) == 0) )
-        {
-            MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
-            return 0;
-        }
-        else if ( unlikely((x & PGT_count_mask) == 0) )
-        {
-            if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
-            {
-                /*
-                 * On type change we check to flush stale TLB entries. This 
-                 * may be unnecessary (e.g., page was GDT/LDT) but those
-                 * circumstances should be very rare.
-                 */
-                struct domain *d = page_get_owner(page);
-                if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
-                                         page->tlbflush_timestamp)) )
-                {
-                    perfc_incr(need_flush_tlb_flush);
-                    flush_tlb_cpu(d->exec_domain[0]->processor);
-                }
-
-                /* We lose existing type, back pointer, and validity. */
-                nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
-                nx |= type;
-
-                /* No special validation needed for writable pages. */
-                /* Page tables and GDT/LDT need to be scanned for validity. */
-                if ( type == PGT_writable_page )
-                    nx |= PGT_validated;
-            }
-        }
-        else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
-        {
-            if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
-            {
-                if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
-                     ((type & PGT_type_mask) != PGT_l1_page_table) )
-                    MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
-                            x & PGT_type_mask, type, page_to_pfn(page));
-                return 0;
-            }
-            else if ( (x & PGT_va_mask) == PGT_va_mutable )
-            {
-                /* The va backpointer is mutable, hence we update it. */
-                nx &= ~PGT_va_mask;
-                nx |= type; /* we know the actual type is correct */
-            }
-            else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
-            {
-                /* This table is potentially mapped at multiple locations. */
-                nx &= ~PGT_va_mask;
-                nx |= PGT_va_unknown;
-            }
-        }
-        else if ( unlikely(!(x & PGT_validated)) )
-        {
-            /* Someone else is updating validation of this page. Wait... */
-            while ( (y = page->u.inuse.type_info) == x )
-            {
-                rep_nop();
-                barrier();
-            }
-            goto again;
-        }
-    }
-    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
-
-    if ( unlikely(!(nx & PGT_validated)) )
-    {
-        /* Try to validate page type; drop the new reference on failure. */
-        if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
-        {
-            MEM_LOG("Error while validating pfn %08lx for type %08x."
-                    " caf=%08x taf=%08x\n",
-                    page_to_pfn(page), type,
-                    page->count_info,
-                    page->u.inuse.type_info);
-            /* Noone else can get a reference. We hold the only ref. */
-            page->u.inuse.type_info = 0;
-            return 0;
-        }
-
-        /* Noone else is updating simultaneously. */
-        __set_bit(_PGT_validated, &page->u.inuse.type_info);
-    }
-
-    return 1;
-}
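
get_page_type() and put_page_type() above share a lock-free
read-modify-write pattern on type_info. A minimal sketch of just the
count-increment step, substituting a GCC atomic builtin for the tree's
cmpxchg() macro:

    static int try_get_type_ref(u32 *type_info)
    {
        u32 x, nx, y = *type_info;
        do {
            x  = y;
            nx = x + 1;
            if ( (nx & PGT_count_mask) == 0 )
                return 0;          /* type count would overflow */
            y = __sync_val_compare_and_swap(type_info, x, nx);
        } while ( y != x );        /* raced with another CPU: retry */
        return 1;
    }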
-
-
-int new_guest_cr3(unsigned long pfn)
-{
-    struct exec_domain *ed = current;
-    struct domain *d = ed->domain;
-    int okay, cpu = smp_processor_id();
-    unsigned long old_base_pfn;
-    
-    okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
-    if ( likely(okay) )
-    {
-        invalidate_shadow_ldt(ed);
-
-        percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
-        old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT;
-        ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
-
-        shadow_mk_pagetable(ed);
-
-        write_ptbase(ed);
-
-        put_page_and_type(&frame_table[old_base_pfn]);
-    }
-    else
-    {
-        MEM_LOG("Error while installing new baseptr %08lx", pfn);
-    }
-
-    return okay;
-}
-
-static int do_extended_command(unsigned long ptr, unsigned long val)
-{
-    int okay = 1, cpu = smp_processor_id();
-    unsigned int cmd = val & MMUEXT_CMD_MASK;
-    unsigned long pfn = ptr >> PAGE_SHIFT;
-    struct pfn_info *page = &frame_table[pfn];
-    struct exec_domain *ed = current;
-    struct domain *d = ed->domain, *nd, *e;
-    u32 x, y;
-    domid_t domid;
-    grant_ref_t gntref;
-
-    switch ( cmd )
-    {
-    case MMUEXT_PIN_L1_TABLE:
-    case MMUEXT_PIN_L2_TABLE:
-        /*
-         * We insist that, if you pin an L1 page, it's the first thing that
-         * you do to it. This is because we require the backptr to still be
-         * mutable. This assumption seems safe.
-         */
-        okay = get_page_and_type_from_pagenr(
-            pfn, 
-            ((cmd==MMUEXT_PIN_L2_TABLE) ? 
-             PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
-            FOREIGNDOM);
-
-        if ( unlikely(!okay) )
-        {
-            MEM_LOG("Error while pinning pfn %08lx", pfn);
-            break;
-        }
-
-        if ( unlikely(test_and_set_bit(_PGT_pinned,
-                                       &page->u.inuse.type_info)) )
-        {
-            MEM_LOG("Pfn %08lx already pinned", pfn);
-            put_page_and_type(page);
-            okay = 0;
-            break;
-        }
-
-        break;
-
-    case MMUEXT_UNPIN_TABLE:
-        if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
-        {
-            MEM_LOG("Page %08lx bad domain (dom=%p)",
-                    ptr, page_get_owner(page));
-        }
-        else if ( likely(test_and_clear_bit(_PGT_pinned, 
-                                            &page->u.inuse.type_info)) )
-        {
-            put_page_and_type(page);
-            put_page(page);
-        }
-        else
-        {
-            okay = 0;
-            put_page(page);
-            MEM_LOG("Pfn %08lx not pinned", pfn);
-        }
-        break;
-
-    case MMUEXT_NEW_BASEPTR:
-        okay = new_guest_cr3(pfn);
-        break;
-        
-    case MMUEXT_TLB_FLUSH:
-        percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
-        break;
-    
-    case MMUEXT_INVLPG:
-        __flush_tlb_one(ptr);
-        break;
-
-    case MMUEXT_FLUSH_CACHE:
-        if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
-        {
-            MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
-            okay = 0;
-        }
-        else
-        {
-            wbinvd();
-        }
-        break;
-
-    case MMUEXT_SET_LDT:
-    {
-        unsigned long ents = val >> MMUEXT_CMD_SHIFT;
-        if ( ((ptr & (PAGE_SIZE-1)) != 0) || 
-             (ents > 8192) ||
-             ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
-             ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
-        {
-            okay = 0;
-            MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
-        }
-        else if ( (ed->arch.ldt_ents != ents) || 
-                  (ed->arch.ldt_base != ptr) )
-        {
-            invalidate_shadow_ldt(ed);
-            ed->arch.ldt_base = ptr;
-            ed->arch.ldt_ents = ents;
-            load_LDT(ed);
-            percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
-            if ( ents != 0 )
-                percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
-        }
-        break;
-    }
-
-    case MMUEXT_SET_FOREIGNDOM:
-        domid = (domid_t)(val >> 16);
-
-        if ( (e = percpu_info[cpu].foreign) != NULL )
-            put_domain(e);
-        percpu_info[cpu].foreign = NULL;
-
-        if ( !IS_PRIV(d) )
-        {
-            switch ( domid )
-            {
-            case DOMID_IO:
-                get_knownalive_domain(dom_io);
-                percpu_info[cpu].foreign = dom_io;
-                break;
-            default:
-                MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
-                okay = 0;
-                break;
-            }
-        }
-        else
-        {
-            percpu_info[cpu].foreign = e = find_domain_by_id(domid);
-            if ( e == NULL )
-            {
-                switch ( domid )
-                {
-                case DOMID_XEN:
-                    get_knownalive_domain(dom_xen);
-                    percpu_info[cpu].foreign = dom_xen;
-                    break;
-                case DOMID_IO:
-                    get_knownalive_domain(dom_io);
-                    percpu_info[cpu].foreign = dom_io;
-                    break;
-                default:
-                    MEM_LOG("Unknown domain '%u'", domid);
-                    okay = 0;
-                    break;
-                }
-            }
-        }
-        break;
-
-    case MMUEXT_TRANSFER_PAGE:
-        domid  = (domid_t)(val >> 16);
-        gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
-        
-        if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
-             unlikely(!pfn_is_ram(pfn)) ||
-             unlikely((e = find_domain_by_id(domid)) == NULL) )
-        {
-            MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
-            okay = 0;
-            break;
-        }
-
-        spin_lock(&d->page_alloc_lock);
-
-        /*
-         * The tricky bit: atomically release ownership while there is just one
-         * benign reference to the page (PGC_allocated). If that reference
-         * disappears then the deallocation routine will safely spin.
-         */
-        nd = page_get_owner(page);
-        y  = page->count_info;
-        do {
-            x = y;
-            if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 
-                          (1|PGC_allocated)) ||
-                 unlikely(nd != d) )
-            {
-                MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
-                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
-                        d, d->id, nd, x, page->u.inuse.type_info);
-                spin_unlock(&d->page_alloc_lock);
-                put_domain(e);
-                return 0;
-            }
-            __asm__ __volatile__(
-                LOCK_PREFIX "cmpxchg8b %2"
-                : "=d" (nd), "=a" (y),
-                "=m" (*(volatile u64 *)(&page->count_info))
-                : "0" (d), "1" (x), "c" (NULL), "b" (x) );
-        } 
-        while ( unlikely(nd != d) || unlikely(y != x) );
-
-        /*
-         * Unlink from 'd'. At least one reference remains (now anonymous), so
-         * noone else is spinning to try to delete this page from 'd'.
-         */
-        d->tot_pages--;
-        list_del(&page->list);
-        
-        spin_unlock(&d->page_alloc_lock);
-
-        spin_lock(&e->page_alloc_lock);
-
-        /*
-         * Check that 'e' will accept the page and has reservation headroom.
-         * Also, a domain mustn't have PGC_allocated pages when it is dying.
-         */
-        ASSERT(e->tot_pages <= e->max_pages);
-        if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
-             unlikely(e->tot_pages == e->max_pages) ||
-             unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
-        {
-            MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
-                    "provided a bad grant ref, or is dying (%08lx).\n",
-                    e->tot_pages, e->max_pages, e->d_flags);
-            spin_unlock(&e->page_alloc_lock);
-            put_domain(e);
-            okay = 0;
-            break;
-        }
-
-        /* Okay, add the page to 'e'. */
-        if ( unlikely(e->tot_pages++ == 0) )
-            get_knownalive_domain(e);
-        list_add_tail(&page->list, &e->page_list);
-        page_set_owner(page, e);
-
-        spin_unlock(&e->page_alloc_lock);
-
-        /* Transfer is all done: tell the guest about its new page frame. */
-        gnttab_notify_transfer(e, gntref, pfn);
-        
-        put_domain(e);
-        break;
-
-    case MMUEXT_REASSIGN_PAGE:
-        if ( unlikely(!IS_PRIV(d)) )
-        {
-            MEM_LOG("Dom %u has no reassignment priv", d->id);
-            okay = 0;
-            break;
-        }
-
-        e = percpu_info[cpu].foreign;
-        if ( unlikely(e == NULL) )
-        {
-            MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
-            okay = 0;
-            break;
-        }
-
-        /*
-         * Grab both page_list locks, in order. This prevents the page from
-         * disappearing elsewhere while we modify the owner, and we'll need
-         * both locks if we're successful so that we can change lists.
-         */
-        if ( d < e )
-        {
-            spin_lock(&d->page_alloc_lock);
-            spin_lock(&e->page_alloc_lock);
-        }
-        else
-        {
-            spin_lock(&e->page_alloc_lock);
-            spin_lock(&d->page_alloc_lock);
-        }
-
-        /* A domain shouldn't have PGC_allocated pages when it is dying. */
-        if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
-             unlikely(IS_XEN_HEAP_FRAME(page)) )
-        {
-            MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
-            okay = 0;
-            goto reassign_fail;
-        }
-
-        /*
-         * The tricky bit: atomically change owner while there is just one
-         * benign reference to the page (PGC_allocated). If that reference
-         * disappears then the deallocation routine will safely spin.
-         */
-        nd = page_get_owner(page);
-        y  = page->count_info;
-        do {
-            x = y;
-            if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 
-                          (1|PGC_allocated)) ||
-                 unlikely(nd != d) )
-            {
-                MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
-                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
-                        d, d->id, nd, x, page->u.inuse.type_info);
-                okay = 0;
-                goto reassign_fail;
-            }
-            __asm__ __volatile__(
-                LOCK_PREFIX "cmpxchg8b %3"
-                : "=d" (nd), "=a" (y), "=c" (e),
-                "=m" (*(volatile u64 *)(&page->count_info))
-                : "0" (d), "1" (x), "c" (e), "b" (x) );
-        } 
-        while ( unlikely(nd != d) || unlikely(y != x) );
-        
-        /*
-         * Unlink from 'd'. We transferred at least one reference to 'e', so
-         * no one else is spinning to try to delete this page from 'd'.
-         */
-        d->tot_pages--;
-        list_del(&page->list);
-        
-        /*
-         * Add the page to 'e'. Someone may already have removed the last
-         * reference and want to remove the page from 'e'. However, we have
-         * the lock so they'll spin waiting for us.
-         */
-        if ( unlikely(e->tot_pages++ == 0) )
-            get_knownalive_domain(e);
-        list_add_tail(&page->list, &e->page_list);
-
-    reassign_fail:        
-        spin_unlock(&d->page_alloc_lock);
-        spin_unlock(&e->page_alloc_lock);
-        break;
-
-    case MMUEXT_CLEAR_FOREIGNDOM:
-        if ( (e = percpu_info[cpu].foreign) != NULL )
-            put_domain(e);
-        percpu_info[cpu].foreign = NULL;
-        break;
-
-    default:
-        MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
-        okay = 0;
-        break;
-    }
-
-    return okay;
-}
-
-int do_mmu_update(
-    mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
-{
-/*
- * We steal the m.s.b. of the @count parameter to indicate whether this
- * invocation of do_mmu_update() is resuming a previously preempted call.
- * We steal the next 15 bits to remember the current FOREIGNDOM.
- */
-#define MMU_UPDATE_PREEMPTED          (~(~0U>>1))
-#define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
-#define MMU_UPDATE_PREEMPT_FDOM_MASK  (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
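-/*
- * Worked example, for illustration: with sizeof(int)==4 the FDOM shift is
- * 16, so a continuation with 0x100 requests left over and FOREIGNDOM id 5
- * encodes its count as (0x100 | (5 << 16) | MMU_UPDATE_PREEMPTED), i.e.
- * 0x80050100. The resume path below recovers domid 5 and count 0x100 by
- * shifting and masking off the two stolen fields.
- */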
-
-    mmu_update_t req;
-    unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
-    struct pfn_info *page;
-    int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
-    unsigned int cmd, done = 0;
-    unsigned long prev_spfn = 0;
-    l1_pgentry_t *prev_spl1e = 0;
-    struct exec_domain *ed = current;
-    struct domain *d = ed->domain;
-    u32 type_info;
-    domid_t domid;
-
-    LOCK_BIGLOCK(d);
-
-    cleanup_writable_pagetable(d);
-
-    /*
-     * If we are resuming after preemption, read how much work we have already
-     * done. This allows us to set the @done output parameter correctly.
-     * We also reset FOREIGNDOM here.
-     */
-    if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
-    {
-        if ( !(count & MMU_UPDATE_PREEMPTED) )
-        {
-            /* Count overflow into private FOREIGNDOM field. */
-            MEM_LOG("do_mmu_update count is too large");
-            rc = -EINVAL;
-            goto out;
-        }
-        count &= ~MMU_UPDATE_PREEMPTED;
-        domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
-        count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
-        if ( unlikely(pdone != NULL) )
-            (void)get_user(done, pdone);
-        if ( (domid != current->domain->id) &&
-             !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
-        {
-            rc = -EINVAL;
-            goto out;
-        }
-    }
-
-    perfc_incrc(calls_to_mmu_update); 
-    perfc_addc(num_page_updates, count);
-
-    if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
-    {
-        rc = -EFAULT;
-        goto out;
-    }
-
-    for ( i = 0; i < count; i++ )
-    {
-        if ( hypercall_preempt_check() )
-        {
-            rc = hypercall3_create_continuation(
-                __HYPERVISOR_mmu_update, ureqs, 
-                (count - i) |
-                (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | 
-                MMU_UPDATE_PREEMPTED, pdone);
-            break;
-        }
-
-        if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
-        {
-            MEM_LOG("Bad __copy_from_user");
-            rc = -EFAULT;
-            break;
-        }
-
-        cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
-        pfn = req.ptr >> PAGE_SHIFT;
-
-        okay = 0;
-
-        switch ( cmd )
-        {
-            /*
-             * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
-             */
-        case MMU_NORMAL_PT_UPDATE:
-            if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
-            {
-                MEM_LOG("Could not get page for normal update");
-                break;
-            }
-
-            if ( likely(prev_pfn == pfn) )
-            {
-                va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
-            }
-            else
-            {
-                if ( prev_pfn != 0 )
-                    unmap_domain_mem((void *)va);
-                va = (unsigned long)map_domain_mem(req.ptr);
-                prev_pfn = pfn;
-            }
-
-            page = &frame_table[pfn];
-            switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
-            {
-            case PGT_l1_page_table: 
-                if ( likely(get_page_type(
-                    page, type_info & (PGT_type_mask|PGT_va_mask))) )
-                {
-                    okay = mod_l1_entry((l1_pgentry_t *)va, 
-                                        mk_l1_pgentry(req.val)); 
-
-                    if ( unlikely(d->arch.shadow_mode) && okay &&
-                         (get_shadow_status(d, page-frame_table) &
-                          PSH_shadowed) )
-                    {
-                        shadow_l1_normal_pt_update(
-                            req.ptr, req.val, &prev_spfn, &prev_spl1e);
-                        put_shadow_status(d);
-                    }
-
-                    put_page_type(page);
-                }
-                break;
-            case PGT_l2_page_table:
-                if ( likely(get_page_type(page, PGT_l2_page_table)) )
-                {
-                    okay = mod_l2_entry((l2_pgentry_t *)va, 
-                                        mk_l2_pgentry(req.val),
-                                        pfn); 
-
-                    if ( unlikely(d->arch.shadow_mode) && okay &&
-                         (get_shadow_status(d, page-frame_table) & 
-                          PSH_shadowed) )
-                    {
-                        shadow_l2_normal_pt_update(req.ptr, req.val);
-                        put_shadow_status(d);
-                    }
-
-                    put_page_type(page);
-                }
-                break;
-            default:
-                if ( likely(get_page_type(page, PGT_writable_page)) )
-                {
-                    *(unsigned long *)va = req.val;
-                    okay = 1;
-                    put_page_type(page);
-                }
-                break;
-            }
-
-            put_page(page);
-            break;
-
-        case MMU_MACHPHYS_UPDATE:
-            if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
-            {
-                MEM_LOG("Could not get page for mach->phys update");
-                break;
-            }
-
-            machine_to_phys_mapping[pfn] = req.val;
-            okay = 1;
-
-            /*
-             * If in log-dirty mode, mark the corresponding pseudo-physical
-             * page as dirty.
-             */
-            if ( unlikely(d->arch.shadow_mode == SHM_logdirty) && 
-                 mark_dirty(d, pfn) )
-                d->arch.shadow_dirty_block_count++;
-
-            put_page(&frame_table[pfn]);
-            break;
-
-            /*
-             * MMU_EXTENDED_COMMAND: Extended command is specified
-             * in the least-significant bits of the 'value' field.
-             */
-        case MMU_EXTENDED_COMMAND:
-            req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
-            okay = do_extended_command(req.ptr, req.val);
-            break;
-
-        default:
-            MEM_LOG("Invalid page update command %08lx", req.ptr);
-            break;
-        }
-
-        if ( unlikely(!okay) )
-        {
-            rc = -EINVAL;
-            break;
-        }
-
-        ureqs++;
-    }
-
- out:
-    if ( prev_pfn != 0 )
-        unmap_domain_mem((void *)va);
-
-    if ( unlikely(prev_spl1e != 0) ) 
-        unmap_domain_mem((void *)prev_spl1e);
-
-    deferred_ops = percpu_info[cpu].deferred_ops;
-    percpu_info[cpu].deferred_ops = 0;
-
-    if ( deferred_ops & DOP_FLUSH_TLB )
-        local_flush_tlb();
-        
-    if ( deferred_ops & DOP_RELOAD_LDT )
-        (void)map_ldt_shadow_page(0);
-
-    if ( unlikely(percpu_info[cpu].foreign != NULL) )
-    {
-        put_domain(percpu_info[cpu].foreign);
-        percpu_info[cpu].foreign = NULL;
-    }
-
-    /* Add incremental work we have done to the @done output parameter. */
-    if ( unlikely(pdone != NULL) )
-        __put_user(done + i, pdone);
-
-    UNLOCK_BIGLOCK(d);
-    return rc;
-}
-
-
-int do_update_va_mapping(unsigned long page_nr, 
-                         unsigned long val, 
-                         unsigned long flags)
-{
-    struct exec_domain *ed = current;
-    struct domain *d = ed->domain;
-    int err = 0;
-    unsigned int cpu = ed->processor;
-    unsigned long deferred_ops;
-
-    perfc_incrc(calls_to_update_va);
-
-    if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
-        return -EINVAL;
-
-    LOCK_BIGLOCK(d);
-
-    cleanup_writable_pagetable(d);
-
-    /*
-     * XXX When we make this support 4MB superpages we should also deal with 
-     * the case of updating L2 entries.
-     */
-
-    if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr], 
-                                mk_l1_pgentry(val))) )
-        err = -EINVAL;
-
-    if ( unlikely(d->arch.shadow_mode) )
-    {
-        unsigned long sval;
-
-        l1pte_propagate_from_guest(d, &val, &sval);
-
-        if ( unlikely(__put_user(sval, ((unsigned long *)(
-            &shadow_linear_pg_table[page_nr])))) )
-        {
-            /*
-             * Since L2's are guaranteed RW, failure indicates the page was not
-             * shadowed, so ignore.
-             */
-            perfc_incrc(shadow_update_va_fail);
-        }
-
-        /*
-         * If we're in log-dirty mode then we need to note that we've updated
-         * the PTE in the PT-holding page. We need the machine frame number
-         * for this.
-         */
-        if ( d->arch.shadow_mode == SHM_logdirty )
-            mark_dirty(d, va_to_l1mfn(page_nr << PAGE_SHIFT));  
-  
-        check_pagetable(d, ed->arch.pagetable, "va"); /* debug */
-    }
-
-    deferred_ops = percpu_info[cpu].deferred_ops;
-    percpu_info[cpu].deferred_ops = 0;
-
-    if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || 
-         unlikely(flags & UVMF_FLUSH_TLB) )
-        local_flush_tlb();
-    else if ( unlikely(flags & UVMF_INVLPG) )
-        __flush_tlb_one(page_nr << PAGE_SHIFT);
-
-    if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
-        (void)map_ldt_shadow_page(0);
-    
-    UNLOCK_BIGLOCK(d);
-
-    return err;
-}
-
-int do_update_va_mapping_otherdomain(unsigned long page_nr, 
-                                     unsigned long val, 
-                                     unsigned long flags,
-                                     domid_t domid)
-{
-    unsigned int cpu = smp_processor_id();
-    struct domain *d;
-    int rc;
-
-    if ( unlikely(!IS_PRIV(current->domain)) )
-        return -EPERM;
-
-    percpu_info[cpu].foreign = d = find_domain_by_id(domid);
-    if ( unlikely(d == NULL) )
-    {
-        MEM_LOG("Unknown domain '%u'", domid);
-        return -ESRCH;
-    }
-
-    rc = do_update_va_mapping(page_nr, val, flags);
-
-    put_domain(d);
-    percpu_info[cpu].foreign = NULL;
-
-    return rc;
-}
-
-
-
-/*************************
- * Descriptor Tables
- */
-
-void destroy_gdt(struct exec_domain *ed)
-{
-    int i;
-    unsigned long pfn;
-
-    for ( i = 0; i < 16; i++ )
-    {
-        if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 )
-            put_page_and_type(&frame_table[pfn]);
-        ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
-    }
-}
-
-
-long set_gdt(struct exec_domain *ed, 
-             unsigned long *frames,
-             unsigned int entries)
-{
-    struct domain *d = ed->domain;
-    /* NB. There are 512 8-byte entries per GDT page. */
-    int i = 0, nr_pages = (entries + 511) / 512;
-    struct desc_struct *vgdt;
-    unsigned long pfn;
-
-    /* Check the first page in the new GDT. */
-    if ( (pfn = frames[0]) >= max_page )
-        goto fail;
-
-    /* The first page is special because Xen owns a range of entries in it. */
-    if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
-    {
-        /* GDT checks failed: try zapping the Xen reserved entries. */
-        if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) )
-            goto fail;
-        vgdt = map_domain_mem(pfn << PAGE_SHIFT);
-        memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0,
-               NR_RESERVED_GDT_ENTRIES*8);
-        unmap_domain_mem(vgdt);
-        put_page_and_type(&frame_table[pfn]);
-
-        /* Okay, we zapped the entries. Now try the GDT checks again. */
-        if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
-            goto fail;
-    }
-
-    /* Check the remaining pages in the new GDT. */
-    for ( i = 1; i < nr_pages; i++ )
-        if ( ((pfn = frames[i]) >= max_page) ||
-             !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
-            goto fail;
-
-    /* Copy reserved GDT entries to the new GDT. */
-    vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
-    memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 
-           gdt_table + FIRST_RESERVED_GDT_ENTRY, 
-           NR_RESERVED_GDT_ENTRIES*8);
-    unmap_domain_mem(vgdt);
-
-    /* Tear down the old GDT. */
-    destroy_gdt(ed);
-
-    /* Install the new GDT. */
-    for ( i = 0; i < nr_pages; i++ )
-        ed->arch.perdomain_ptes[i] =
-            mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
-
-    SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed));
-    SET_GDT_ENTRIES(ed, entries);
-
-    return 0;
-
- fail:
-    while ( i-- > 0 )
-        put_page_and_type(&frame_table[frames[i]]);
-    return -EINVAL;
-}
-
-
-long do_set_gdt(unsigned long *frame_list, unsigned int entries)
-{
-    int nr_pages = (entries + 511) / 512;
-    unsigned long frames[16];
-    long ret;
-
-    if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) ) 
-        return -EINVAL;
-    
-    if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
-        return -EFAULT;
-
-    LOCK_BIGLOCK(current->domain);
-
-    if ( (ret = set_gdt(current, frames, entries)) == 0 )
-    {
-        local_flush_tlb();
-        __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt));
-    }
-
-    UNLOCK_BIGLOCK(current->domain);
-
-    return ret;
-}
-
-
-long do_update_descriptor(
-    unsigned long pa, unsigned long word1, unsigned long word2)
-{
-    unsigned long pfn = pa >> PAGE_SHIFT;
-    struct desc_struct *gdt_pent, d;
-    struct pfn_info *page;
-    struct exec_domain *ed;
-    long ret = -EINVAL;
-
-    d.a = (u32)word1;
-    d.b = (u32)word2;
-
-    LOCK_BIGLOCK(current->domain);
-
-    if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) {
-        UNLOCK_BIGLOCK(current->domain);
-        return -EINVAL;
-    }
-
-    page = &frame_table[pfn];
-    if ( unlikely(!get_page(page, current->domain)) ) {
-        UNLOCK_BIGLOCK(current->domain);
-        return -EINVAL;
-    }
-
-    /* Check if the given frame is in use in an unsafe context. */
-    switch ( page->u.inuse.type_info & PGT_type_mask )
-    {
-    case PGT_gdt_page:
-        /* Disallow updates of Xen-reserved descriptors in the current GDT. */
-        for_each_exec_domain(current->domain, ed) {
-            if ( (l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[0]) == pfn) &&
-                 (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
-                 (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
-                goto out;
-        }
-        if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
-            goto out;
-        break;
-    case PGT_ldt_page:
-        if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
-            goto out;
-        break;
-    default:
-        if ( unlikely(!get_page_type(page, PGT_writable_page)) )
-            goto out;
-        break;
-    }
-
-    /* All is good so make the update. */
-    gdt_pent = map_domain_mem(pa);
-    memcpy(gdt_pent, &d, 8);
-    unmap_domain_mem(gdt_pent);
-
-    put_page_type(page);
-
-    ret = 0; /* success */
-
- out:
-    put_page(page);
-
-    UNLOCK_BIGLOCK(current->domain);
-
-    return ret;
-}
-
-
-
-/*************************
- * Writable Pagetables
- */
-
-ptwr_info_t ptwr_info[NR_CPUS];
-
-#ifdef VERBOSE
-int ptwr_debug = 0x0;
-#define PTWR_PRINTK(_f, _a...) \
- do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
-#define PTWR_PRINT_WHICH (which ? 'I' : 'A')
-#else
-#define PTWR_PRINTK(_f, _a...) ((void)0)
-#endif
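-
-/*
- * Outline of the mechanism: a guest write to one of its (read-only) L1
- * pagetable pages faults into ptwr_do_page_fault(), which disconnects the
- * page from the L2 (if it is the ACTIVE one), snapshots its contents, and
- * maps it writable so subsequent guest updates proceed at full speed.
- * ptwr_flush() later write-protects the page again, compares it against
- * the snapshot, and revalidates only the entries that changed.
- */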
-
-/* Flush the given writable p.t. page and write-protect it again. */
-void ptwr_flush(const int which)
-{
-    unsigned long  sstat, spte, pte, *ptep, l1va;
-    l1_pgentry_t  *sl1e = NULL, *pl1e, ol1e, nl1e;
-    l2_pgentry_t  *pl2e;
-    int            i, cpu = smp_processor_id();
-    struct exec_domain *ed = current;
-    struct domain *d = ed->domain;
-
-    l1va = ptwr_info[cpu].ptinfo[which].l1va;
-    ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
-
-    /*
-     * STEP 1. Write-protect the p.t. page so no more updates can occur.
-     */
-
-    if ( unlikely(__get_user(pte, ptep)) )
-    {
-        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
-        /*
-         * Really a bug: we could read this PTE during the initial fault, and
-         * the page tables can't have changed in the meantime.
-         * XXX Multi-CPU guests?
-         */
-        BUG();
-    }
-    PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
-                PTWR_PRINT_WHICH, ptep, pte);
-    pte &= ~_PAGE_RW;
-
-    if ( unlikely(d->arch.shadow_mode) )
-    {
-        /* Write-protect the p.t. page in the shadow page table. */
-        l1pte_propagate_from_guest(d, &pte, &spte);
-        __put_user(
-            spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
-
-        /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
-        sstat = get_shadow_status(d, pte >> PAGE_SHIFT);
-        if ( sstat & PSH_shadowed )
-            sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
-    }
-
-    /* Write-protect the p.t. page in the guest page table. */
-    if ( unlikely(__put_user(pte, ptep)) )
-    {
-        MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
-        /*
-         * Really a bug: we could write this PTE during the initial fault, and
-         * the page tables can't have changed in the meantime.
-         * XXX Multi-CPU guests?
-         */
-        BUG();
-    }
-
-    /* Ensure that there are no stale writable mappings in any TLB. */
-    /* NB. INVLPG is a serialising instruction: flushes pending updates. */
-#if 1
-    __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
-#else
-    flush_tlb_all();
-#endif
-    PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
-                PTWR_PRINT_WHICH, ptep, pte);
-
-    /*
-     * STEP 2. Validate any modified PTEs.
-     */
-
-    pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
-    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
-    {
-        ol1e = ptwr_info[cpu].ptinfo[which].page[i];
-        nl1e = pl1e[i];
-
-        if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
-            continue;
-
-        /*
-         * Fast path for PTEs that have merely been write-protected
-         * (e.g., during a Unix fork()). A strict reduction in privilege.
-         */
-        if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
-        {
-            if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
-            {
-                if ( unlikely(sl1e != NULL) )
-                    l1pte_propagate_from_guest(
-                        d, &l1_pgentry_val(nl1e), 
-                        &l1_pgentry_val(sl1e[i]));
-                put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
-            }
-            continue;
-        }
-
-        if ( unlikely(!get_page_from_l1e(nl1e, d)) )
-        {
-            MEM_LOG("ptwr: Could not re-validate l1 page\n");
-            /*
-             * Make the remaining p.t.'s consistent before crashing, so the
-             * reference counts are correct.
-             */
-            memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
-                   (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
-            unmap_domain_mem(pl1e);
-            ptwr_info[cpu].ptinfo[which].l1va = 0;
-            UNLOCK_BIGLOCK(d);
-            domain_crash();
-        }
-        
-        if ( unlikely(sl1e != NULL) )
-            l1pte_propagate_from_guest(
-                d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
-
-        if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
-            put_page_from_l1e(ol1e, d);
-    }
-    unmap_domain_mem(pl1e);
-
-    /*
-     * STEP 3. Reattach the L1 p.t. page into the current address space.
-     */
-
-    if ( (which == PTWR_PT_ACTIVE) && likely(!d->arch.shadow_mode) )
-    {
-        pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
-        *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); 
-    }
-
-    /*
-     * STEP 4. Final tidy-up.
-     */
-
-    ptwr_info[cpu].ptinfo[which].l1va = 0;
-
-    if ( unlikely(sl1e != NULL) )
-    {
-        unmap_domain_mem(sl1e);
-        put_shadow_status(d);
-    }
-}
-
-/* Write page fault handler: check if guest is trying to modify a PTE. */
-int ptwr_do_page_fault(unsigned long addr)
-{
-    unsigned long    pte, pfn, l2e;
-    struct pfn_info *page;
-    l2_pgentry_t    *pl2e;
-    int              which, cpu = smp_processor_id();
-    u32              l2_idx;
-
-    /*
-     * Attempt to read the PTE that maps the VA being accessed. By checking for
-     * PDE validity in the L2 we avoid many expensive fixups in __get_user().
-     */
-    if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
-           _PAGE_PRESENT) ||
-         __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
-    {
-        return 0;
-    }
-
-    pfn  = pte >> PAGE_SHIFT;
-    page = &frame_table[pfn];
-
-    /* We are looking only for read-only mappings of p.t. pages. */
-    if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
-         ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
-    {
-        return 0;
-    }
-    
-    /* Get the L2 index at which this L1 p.t. is always mapped. */
-    l2_idx = page->u.inuse.type_info & PGT_va_mask;
-    if ( unlikely(l2_idx >= PGT_va_unknown) )
-    {
-        domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
-    }
-    l2_idx >>= PGT_va_shift;
-
-    if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
-    {
-        MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
-        domain_crash();
-    }
-
-    /*
-     * Is the L1 p.t. mapped into the current address space? If so we call it
-     * an ACTIVE p.t., otherwise it is INACTIVE.
-     */
-    pl2e = &linear_l2_table[l2_idx];
-    l2e  = l2_pgentry_val(*pl2e);
-    which = PTWR_PT_INACTIVE;
-    if ( (l2e >> PAGE_SHIFT) == pfn )
-    {
-        /* Check the PRESENT bit to set ACTIVE. */
-        if ( likely(l2e & _PAGE_PRESENT) )
-            which = PTWR_PT_ACTIVE;
-        else {
-            /*
-             * If the PRESENT bit is clear, we may be conflicting with
-             * the current ACTIVE p.t. (it may be the same p.t. mapped
-             * at another virt addr).
-             * The ptwr_flush call below will restore the PRESENT bit.
-             */
-            if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
-                 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
-                which = PTWR_PT_ACTIVE;
-        }
-    }
-    
-    PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
-                "pfn %08lx\n", PTWR_PRINT_WHICH,
-                addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
-    
-    /*
-     * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
-     * time. If there is already one, we must flush it out.
-     */
-    if ( ptwr_info[cpu].ptinfo[which].l1va )
-        ptwr_flush(which);
-
-    ptwr_info[cpu].ptinfo[which].l1va   = addr | 1;
-    ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
-    
-    /* For safety, disconnect the L1 p.t. page from current space. */
-    if ( (which == PTWR_PT_ACTIVE) && 
-         likely(!current->domain->arch.shadow_mode) )
-    {
-        *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
-#if 1
-        flush_tlb(); /* XXX Multi-CPU guests? */
-#else
-        flush_tlb_all();
-#endif
-    }
-    
-    /* Temporarily map the L1 page, and make a copy of it. */
-    ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
-    memcpy(ptwr_info[cpu].ptinfo[which].page,
-           ptwr_info[cpu].ptinfo[which].pl1e,
-           ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
-    
-    /* Finally, make the p.t. page writable by the guest OS. */
-    pte |= _PAGE_RW;
-    PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
-                &linear_pg_table[addr>>PAGE_SHIFT], pte);
-    if ( unlikely(__put_user(pte, (unsigned long *)
-                             &linear_pg_table[addr>>PAGE_SHIFT])) )
-    {
-        MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
-                &linear_pg_table[addr>>PAGE_SHIFT]);
-        /* Toss the writable pagetable state and crash. */
-        unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
-        ptwr_info[cpu].ptinfo[which].l1va = 0;
-        domain_crash();
-    }
-    
-    return EXCRET_fault_fixed;
-}
-
-static __init int ptwr_init(void)
-{
-    int i;
-
-    for ( i = 0; i < smp_num_cpus; i++ )
-    {
-        ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
-            (void *)alloc_xenheap_page();
-        ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
-            (void *)alloc_xenheap_page();
-    }
-
-    return 0;
-}
-__initcall(ptwr_init);
-
-
-
-
-/************************************************************************/
-/************************************************************************/
-/************************************************************************/
-
-#ifndef NDEBUG
-
-void ptwr_status(void)
-{
-    unsigned long pte, *ptep, pfn;
-    struct pfn_info *page;
-    int cpu = smp_processor_id();
-
-    ptep = (unsigned long *)&linear_pg_table
-        [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
-
-    if ( __get_user(pte, ptep) ) {
-        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
-        domain_crash();
-    }
-
-    pfn = pte >> PAGE_SHIFT;
-    page = &frame_table[pfn];
-    printk("need to alloc l1 page %p\n", page);
-    /* make pt page writable */
-    printk("need to make read-only l1-page at %p is %08lx\n",
-           ptep, pte);
-
-    if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
-        return;
-
-    if ( __get_user(pte, (unsigned long *)
-                    ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
-        MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
-                ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
-        domain_crash();
-    }
-    pfn = pte >> PAGE_SHIFT;
-    page = &frame_table[pfn];
-}
-
-void audit_domain(struct domain *d)
-{
-    int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
-
-    void adjust (struct pfn_info *page, int dir, int adjtype)
-    {
-        int count = page->count_info & PGC_count_mask;
-
-        if ( adjtype )
-        {
-            int tcount = page->u.inuse.type_info & PGT_count_mask;
-            
-            ttot++;
-
-            tcount += dir;
-
-            if ( tcount < 0 )
-            {
-                /* This will only come out once. */
-                printk("Audit %d: type count whent below zero pfn=%x "
-                       "taf=%x otaf=%x\n",
-                       d->id, page-frame_table,
-                       page->u.inuse.type_info,
-                       page->tlbflush_timestamp);
-            }
-            
-            page->u.inuse.type_info =
-                (page->u.inuse.type_info & ~PGT_count_mask) | 
-                (tcount & PGT_count_mask);
-        }
-
-        ctot++;
-        count += dir;
-        if ( count < 0 )
-        {
-            /* This will only come out once. */
-            printk("Audit %d: general count whent below zero pfn=%x "
-                   "taf=%x otaf=%x\n",
-                   d->id, page-frame_table,
-                   page->u.inuse.type_info,
-                   page->tlbflush_timestamp);
-        }
-            
-        page->count_info =
-            (page->count_info & ~PGC_count_mask) | 
-            (count & PGC_count_mask);            
-
-    }
-
-    void scan_for_pfn(struct domain *d, unsigned long xpfn)
-    {
-        unsigned long pfn, *pt;
-        struct list_head *list_ent;
-        struct pfn_info *page;
-        int i;
-
-        list_ent = d->page_list.next;
-        for ( i = 0; (list_ent != &d->page_list); i++ )
-        {
-            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
-            page = &frame_table[pfn];
-            
-            switch ( page->u.inuse.type_info & PGT_type_mask )
-            {
-            case PGT_l1_page_table:
-            case PGT_l2_page_table:
-                pt = map_domain_mem(pfn<<PAGE_SHIFT);
-                for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
-                    if ( (pt[i] & _PAGE_PRESENT) &&
-                         ((pt[i] >> PAGE_SHIFT) == xpfn) )
-                        printk("     found dom=%d i=%x pfn=%lx t=%x c=%x\n",
-                               d->id, i, pfn, page->u.inuse.type_info,
-                               page->count_info);
-                unmap_domain_mem(pt);           
-            }
-
-            list_ent = frame_table[pfn].list.next;
-        }
-
-    }
-
-    void scan_for_pfn_remote(unsigned long xpfn)
-    {
-        struct domain *e;
-        for_each_domain ( e )
-            scan_for_pfn( e, xpfn );            
-    }   
-
-    int i;
-    unsigned long pfn;
-    struct list_head *list_ent;
-    struct pfn_info *page;
-
-    if ( d != current->domain )
-        domain_pause(d);
-    synchronise_pagetables(~0UL);
-
-    printk("pt base=%lx sh_info=%x\n",
-           pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT,
-           virt_to_page(d->shared_info)-frame_table);
-           
-    spin_lock(&d->page_alloc_lock);
-
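-    /*
-     * The audit proceeds in four phases: PHASE 0 sanity-checks ownership
-     * and stashes each page's type_info (in tlbflush_timestamp); PHASE 1
-     * walks the page tables, decrementing the counts they account for;
-     * PHASE 2 checks that every count is at its expected baseline; PHASE 3
-     * restores the counts adjusted in PHASE 1.
-     */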
-    /* PHASE 0 */
-
-    list_ent = d->page_list.next;
-    for ( i = 0; (list_ent != &d->page_list); i++ )
-    {
-        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
-        page = &frame_table[pfn];
-
-        if ( page_get_owner(page) != d )
-            BUG();
-
-        if ( (page->u.inuse.type_info & PGT_count_mask) >
-             (page->count_info & PGC_count_mask) )
-            printk("taf > caf %x %x pfn=%lx\n",
-                   page->u.inuse.type_info, page->count_info, pfn );
-#if 0   /* SYSV shared memory pages plus writeable files. */
-        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && 
-             (page->u.inuse.type_info & PGT_count_mask) > 1 )
-        {
-            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
-                  pfn,
-                  page->u.inuse.type_info,
-                  page->count_info );
-            scan_for_pfn_remote(pfn);
-        }
-#endif
-        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && 
-             (page->u.inuse.type_info & PGT_count_mask) > 1 )
-        {
-            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
-                  pfn,
-                  page->u.inuse.type_info,
-                  page->count_info );
-        }
-
-        /* Use tlbflush_timestamp to store original type_info. */
-        page->tlbflush_timestamp = page->u.inuse.type_info;
-
-        list_ent = frame_table[pfn].list.next;
-    }
-
-
-    /* PHASE 1 */
-
-    adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1);
-
-    list_ent = d->page_list.next;
-    for ( i = 0; (list_ent != &d->page_list); i++ )
-    {
-        unsigned long *pt;
-        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
-        page = &frame_table[pfn];
-
-        if ( page_get_owner(page) != d )
-            BUG();
-
-        switch ( page->u.inuse.type_info & PGT_type_mask )
-        {
-        case PGT_l2_page_table:
-
-            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
-                printk("Audit %d: L2 not validated %x\n",
-                       d->id, page->u.inuse.type_info);
-
-            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
-                printk("Audit %d: L2 not pinned %x\n",
-                       d->id, page->u.inuse.type_info);
-            else
-                adjust( page, -1, 1 );
-           
-            pt = map_domain_mem( pfn<<PAGE_SHIFT );
-
-            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
-            {
-                if ( pt[i] & _PAGE_PRESENT )
-                {
-                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
-                    struct pfn_info *l1page = &frame_table[l1pfn];
-
-                    if ( page_get_owner(l1page) != d )
-                    {
-                        printk("L2: Skip bizarre page belonging to other "
-                               "dom %p\n", page_get_owner(l1page));
-                        continue;
-                    }
-                    
-                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
-                         PGT_l2_page_table )
-                        printk("Audit %d: [%x] Found %s Linear PT "
-                               "t=%x pfn=%lx\n", d->id, i, 
-                               (l1pfn==pfn) ? "Self" : "Other",
-                               l1page->u.inuse.type_info,
-                               l1pfn);
-                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
-                              PGT_l1_page_table )
-                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
-                               d->id, i,
-                               l1page->u.inuse.type_info,
-                               l1pfn);
-
-                    adjust(l1page, -1, 1);
-                }
-            }
-
-            unmap_domain_mem(pt);
-
-            break;
-
-
-        case PGT_l1_page_table:
-            
-            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
-                adjust( page, -1, 1 );
-
-            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
-                printk("Audit %d: L1 not validated %x\n",
-                       d->id, page->u.inuse.type_info);
-#if 0
-            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
-                printk("Audit %d: L1 not pinned %x\n",
-                       d->id, page->u.inuse.type_info);
-#endif
-            pt = map_domain_mem( pfn<<PAGE_SHIFT );
-
-            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
-            {
-                if ( pt[i] & _PAGE_PRESENT )
-                {
-                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
-                    struct pfn_info *l1page = &frame_table[l1pfn];
-
-                    if ( l1pfn < 0x100 )
-                    {
-                        lowmem_mappings++;
-                        continue;
-                    }
-
-                    if ( l1pfn > max_page )
-                    {
-                        io_mappings++;
-                        continue;
-                    }
-
-                    if ( pt[i] & _PAGE_RW )
-                    {
-
-                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
-                             PGT_l1_page_table ||
-                             (l1page->u.inuse.type_info & PGT_type_mask) ==
-                             PGT_l2_page_table )
-                            printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
-                                   d->id, i,
-                                   l1page->u.inuse.type_info,
-                                   l1pfn);
-
-                    }
-
-                    if ( page_get_owner(l1page) != d )
-                    {
-                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
-                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
-                               d->id, pfn, i,
-                               page_get_owner(l1page),
-                               l1pfn,
-                               l1page->count_info,
-                               l1page->u.inuse.type_info,
-                               machine_to_phys_mapping[l1pfn]);    
-                        continue;
-                    }
-
-                    adjust(l1page, -1, 0);
-                }
-            }
-
-            unmap_domain_mem(pt);
-
-            break;
-        }       
-
-        list_ent = frame_table[pfn].list.next;
-    }
-
-    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
-        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
-               d->id, lowmem_mappings, io_mappings);
-
-    /* PHASE 2 */
-
-    ctot = ttot = 0;
-    list_ent = d->page_list.next;
-    for ( i = 0; (list_ent != &d->page_list); i++ )
-    {
-        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
-        page = &frame_table[pfn];
-
-        switch ( page->u.inuse.type_info & PGT_type_mask)
-        {
-        case PGT_l1_page_table:
-        case PGT_l2_page_table:
-            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
-            {
-                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
-                       d->id, page->u.inuse.type_info, 
-                       page->tlbflush_timestamp,
-                       page->count_info, pfn );
-                scan_for_pfn_remote(pfn);
-            }
-        default:
-            if ( (page->count_info & PGC_count_mask) != 1 )
-            {
-                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
-                       d->id, 
-                       page->count_info,
-                       page->u.inuse.type_info, 
-                       page->tlbflush_timestamp, pfn );
-                scan_for_pfn_remote(pfn);
-            }
-            break;
-        }
-
-        list_ent = frame_table[pfn].list.next;
-    }
-
-    /* PHASE 3 */
-    list_ent = d->page_list.next;
-    for ( i = 0; (list_ent != &d->page_list); i++ )
-    {
-        unsigned long *pt;
-        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
-        page = &frame_table[pfn];
-
-        switch ( page->u.inuse.type_info & PGT_type_mask )
-        {
-        case PGT_l2_page_table:
-            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
-                adjust( page, 1, 1 );          
-
-            pt = map_domain_mem( pfn<<PAGE_SHIFT );
-
-            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
-            {
-                if ( pt[i] & _PAGE_PRESENT )
-                {
-                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
-                    struct pfn_info *l1page;
-
-                    if (l1pfn>max_page)
-                        continue;
-
-                    l1page = &frame_table[l1pfn];
-
-                    if ( page_get_owner(l1page) == d )
-                        adjust(l1page, 1, 1);
-                }
-            }
-
-            unmap_domain_mem(pt);
-            break;
-
-        case PGT_l1_page_table:
-            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
-                adjust( page, 1, 1 );
-
-            pt = map_domain_mem( pfn<<PAGE_SHIFT );
-
-            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
-            {
-                if ( pt[i] & _PAGE_PRESENT )
-                {
-                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
-                    struct pfn_info *l1page;
-
-                    if (l1pfn>max_page)
-                        continue;
-
-                    l1page = &frame_table[l1pfn];
-
-                    if ( (page_get_owner(l1page) != d) ||
-                         (l1pfn < 0x100) || (l1pfn > max_page) )
-                        continue;
-
-                    adjust(l1page, 1, 0);
-                }
-            }
-
-            unmap_domain_mem(pt);
-            break;
-        }
-
-
-        page->tlbflush_timestamp = 0;
-
-        list_ent = frame_table[pfn].list.next;
-    }
-
-    spin_unlock(&d->page_alloc_lock);
-
-    adjust(&frame_table[pagetable_val(
-        d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1);
-
-    printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
-
-    if ( d != current->domain )
-        domain_unpause(d);
-}
-
-void audit_domains(void)
-{
-    struct domain *d;
-    for_each_domain ( d )
-        audit_domain(d);
-}
-
-void audit_domains_key(unsigned char key)
-{
-    audit_domains();
-}
-
-#endif
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
new file mode 100644 (file)
index 0000000..c2f87ee
--- /dev/null
@@ -0,0 +1,2592 @@
+/* -*-  Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
+/******************************************************************************
+ * arch/x86/mm.c
+ * 
+ * Copyright (c) 2002-2005 K A Fraser
+ * Copyright (c) 2004 Christian Limpach
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ * A description of the x86 page table API:
+ * 
+ * Domains trap to do_mmu_update with a list of update requests.
+ * This is a list of (ptr, val) pairs, where the requested operation
+ * is *ptr = val.
+ * 
+ * Reference counting of pages:
+ * ----------------------------
+ * Each page has two refcounts: tot_count and type_count.
+ * 
+ * TOT_COUNT is the obvious reference count. It counts all uses of a
+ * physical page frame by a domain, including uses as a page directory,
+ * a page table, or simple mappings via a PTE. This count prevents a
+ * domain from releasing a frame back to the free pool when it still holds
+ * a reference to it.
+ * 
+ * TYPE_COUNT is more subtle. A frame can be put to one of three
+ * mutually-exclusive uses: it might be used as a page directory, or a
+ * page table, or it may be mapped writable by the domain [of course, a
+ * frame may be used in none of these three ways!].
+ * So, type_count is a count of the number of times a frame is being 
+ * referred to in its current incarnation. Therefore, a page can only
+ * change its type when its type count is zero.
+ * 
+ * Pinning the page type:
+ * ----------------------
+ * The type of a page can be pinned/unpinned with the commands
+ * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
+ * pinning is not reference counted, so it can't be nested).
+ * This is useful to prevent a page's type count falling to zero, at which
+ * point safety checks would need to be carried out next time the count
+ * is increased again.
+ * 
+ * A further note on writable page mappings:
+ * -----------------------------------------
+ * For simplicity, the count of writable mappings for a page may not
+ * correspond to reality. The 'writable count' is incremented for every
+ * PTE which maps the page with the _PAGE_RW flag set. However, for
+ * write access to be possible the page directory entry must also have
+ * its _PAGE_RW bit set. We do not check this as it complicates the 
+ * reference counting considerably [consider the case of multiple
+ * directory entries referencing a single page table, some with the RW
+ * bit set, others not -- it starts getting a bit messy].
+ * In normal use, this simplification shouldn't be a problem.
+ * However, the logic can be added if required.
+ * 
+ * One more note on read-only page mappings:
+ * -----------------------------------------
+ * We want domains to be able to map pages for read-only access. The
+ * main reason is that page tables and directories should be readable
+ * by a domain, but it would not be safe for them to be writable.
+ * However, domains have free access to rings 1 & 2 of the Intel
+ * privilege model. In terms of page protection, these are considered
+ * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
+ * read-only restrictions are respected in supervisor mode -- if the 
+ * bit is clear then any mapped page is writable.
+ * 
+ * We get round this by always setting the WP bit and disallowing 
+ * updates to it. This is very unlikely to cause a problem for guest
+ * OSes, which will generally use the WP bit to simplify copy-on-write
+ * implementation (in that case, OS wants a fault when it writes to
+ * an application-supplied buffer).
+ */
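+
+/*
+ * Guest-side sketch, for illustration only. Assuming the usual public
+ * mmu_update_t layout, with the request type encoded in the low bits of
+ * 'ptr' (those definitions live in the public headers, not here), a
+ * single PTE write from a guest looks like (names illustrative):
+ *
+ *     mmu_update_t u;
+ *     u.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;
+ *     u.val = new_pte_val;
+ *     (void)HYPERVISOR_mmu_update(&u, 1, NULL);
+ *
+ * do_mmu_update() checks the new value against the rules above before
+ * making it visible.
+ */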
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/kernel.h>
+#include <xen/lib.h>
+#include <xen/mm.h>
+#include <xen/sched.h>
+#include <xen/errno.h>
+#include <xen/perfc.h>
+#include <xen/irq.h>
+#include <xen/softirq.h>
+#include <asm/shadow.h>
+#include <asm/page.h>
+#include <asm/flushtlb.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/domain_page.h>
+#include <asm/ldt.h>
+
+#ifdef VERBOSE
+#define MEM_LOG(_f, _a...)                           \
+  printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
+         current->domain->id , __LINE__ , ## _a )
+#else
+#define MEM_LOG(_f, _a...) ((void)0)
+#endif
+
+static int alloc_l2_table(struct pfn_info *page);
+static int alloc_l1_table(struct pfn_info *page);
+static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
+static int get_page_and_type_from_pagenr(unsigned long page_nr, 
+                                         u32 type,
+                                         struct domain *d);
+
+static void free_l2_table(struct pfn_info *page);
+static void free_l1_table(struct pfn_info *page);
+
+static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
+static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
+
+/* Used to defer flushing of memory structures. */
+static struct {
+#define DOP_FLUSH_TLB   (1<<0) /* Flush the TLB.                 */
+#define DOP_RELOAD_LDT  (1<<1) /* Reload the LDT shadow mapping. */
+    unsigned long  deferred_ops;
+    /* If non-NULL, specifies a foreign subject domain for some operations. */
+    struct domain *foreign;
+} __cacheline_aligned percpu_info[NR_CPUS];
+
+/*
+ * Returns the current foreign domain; defaults to the currently-executing
+ * domain if a foreign override hasn't been specified.
+ */
+#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
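+/* NB. 'x ? : y' is GNU C's omitted-operand conditional, i.e. x ? x : y. */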
+
+/* Private domain structs for DOMID_XEN and DOMID_IO. */
+static struct domain *dom_xen, *dom_io;
+
+/* Frame table and its size in pages. */
+struct pfn_info *frame_table;
+unsigned long frame_table_size;
+unsigned long max_page;
+
+void __init init_frametable(void)
+{
+    unsigned long i, p;
+
+    frame_table      = (struct pfn_info *)FRAMETABLE_VIRT_START;
+    frame_table_size = max_page * sizeof(struct pfn_info);
+    frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
+
+    for ( i = 0; i < frame_table_size; i += (4UL << 20) )
+    {
+        p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
+        if ( p == 0 )
+            panic("Not enough memory for frame table\n");
+        map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p, 
+                  4UL << 20, PAGE_HYPERVISOR);
+    }
+
+    memset(frame_table, 0, frame_table_size);
+}
+
+void arch_init_memory(void)
+{
+    extern void subarch_init_memory(struct domain *);
+
+    memset(percpu_info, 0, sizeof(percpu_info));
+
+    /*
+     * Initialise our DOMID_XEN domain.
+     * Any Xen-heap pages that we will allow to be mapped will have
+     * their domain field set to dom_xen.
+     */
+    dom_xen = alloc_domain_struct();
+    atomic_set(&dom_xen->refcnt, 1);
+    dom_xen->id = DOMID_XEN;
+
+    /*
+     * Initialise our DOMID_IO domain.
+     * This domain owns no pages but is considered a special case when
+ * mapping I/O pages, as the mappings occur with the privileges of the caller.
+     */
+    dom_io = alloc_domain_struct();
+    atomic_set(&dom_io->refcnt, 1);
+    dom_io->id = DOMID_IO;
+
+    subarch_init_memory(dom_xen);
+}
+
+void write_ptbase(struct exec_domain *ed)
+{
+    struct domain *d = ed->domain;
+    unsigned long pa;
+
+#ifdef CONFIG_VMX
+    if ( unlikely(d->arch.shadow_mode) )
+        pa = ((d->arch.shadow_mode == SHM_full_32) ?
+              pagetable_val(ed->arch.monitor_table) :
+              pagetable_val(ed->arch.shadow_table));
+    else
+        pa = pagetable_val(ed->arch.pagetable);
+#else
+    if ( unlikely(d->arch.shadow_mode) )
+        pa = pagetable_val(ed->arch.shadow_table);    
+    else
+        pa = pagetable_val(ed->arch.pagetable);
+#endif
+
+    write_cr3(pa);
+}
+
+static void __invalidate_shadow_ldt(struct exec_domain *d)
+{
+    int i;
+    unsigned long pfn;
+    struct pfn_info *page;
+    
+    d->arch.shadow_ldt_mapcnt = 0;
+
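+    /* Shadow LDT mappings occupy slots 16-31 of the per-domain PTE array. */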
+    for ( i = 16; i < 32; i++ )
+    {
+        pfn = l1_pgentry_to_pagenr(d->arch.perdomain_ptes[i]);
+        if ( pfn == 0 ) continue;
+        d->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
+        page = &frame_table[pfn];
+        ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
+        ASSERT_PAGE_IS_DOMAIN(page, d->domain);
+        put_page_and_type(page);
+    }
+
+    /* Dispose of the (now possibly invalid) mappings from the TLB.  */
+    percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
+}
+
+
+static inline void invalidate_shadow_ldt(struct exec_domain *d)
+{
+    if ( d->arch.shadow_ldt_mapcnt != 0 )
+        __invalidate_shadow_ldt(d);
+}
+
+
+static int alloc_segdesc_page(struct pfn_info *page)
+{
+    struct desc_struct *descs;
+    int i;
+
+    descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
+
+    for ( i = 0; i < 512; i++ )
+        if ( unlikely(!check_descriptor(&descs[i])) )
+            goto fail;
+
+    unmap_domain_mem(descs);
+    return 1;
+
+ fail:
+    unmap_domain_mem(descs);
+    return 0;
+}
+
+
+/* Map shadow page at offset @off. */
+int map_ldt_shadow_page(unsigned int off)
+{
+    struct exec_domain *ed = current;
+    struct domain *d = ed->domain;
+    unsigned long l1e;
+
+    if ( unlikely(in_irq()) )
+        BUG();
+
+    __get_user(l1e, (unsigned long *)
+               &linear_pg_table[l1_linear_offset(ed->arch.ldt_base) + off]);
+
+    if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
+         unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], 
+                                     d, PGT_ldt_page)) )
+        return 0;
+
+    ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
+    ed->arch.shadow_ldt_mapcnt++;
+
+    return 1;
+}
+
+
+static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
+{
+    struct pfn_info *page = &frame_table[page_nr];
+
+    if ( unlikely(!pfn_is_ram(page_nr)) )
+    {
+        MEM_LOG("Pfn %08lx is not RAM", page_nr);
+        return 0;
+    }
+
+    if ( unlikely(!get_page(page, d)) )
+    {
+        MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
+        return 0;
+    }
+
+    return 1;
+}
+
+
+static int get_page_and_type_from_pagenr(unsigned long page_nr, 
+                                         u32 type,
+                                         struct domain *d)
+{
+    struct pfn_info *page = &frame_table[page_nr];
+
+    if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
+        return 0;
+
+    if ( unlikely(!get_page_type(page, type)) )
+    {
+#ifdef VERBOSE
+        if ( (type & PGT_type_mask) != PGT_l1_page_table )
+            MEM_LOG("Bad page type for pfn %08lx (%08x)", 
+                    page_nr, page->u.inuse.type_info);
+#endif
+        put_page(page);
+        return 0;
+    }
+
+    return 1;
+}
+
+
+/*
+ * We allow L2 tables to map each other (a.k.a. linear page tables). This
+ * needs some special care with reference counts and access permissions:
+ *  1. The mapping entry must be read-only, or the guest may get write access
+ *     to its own PTEs.
+ *  2. We must only bump the reference counts for an *already validated*
+ *     L2 table, or we can end up in a deadlock in get_page_type() by waiting
+ *     on a validation that cannot complete until our own validation does.
+ *  3. We only need to increment the reference counts for the mapped page
+ *     frame if it is mapped by a different L2 table. This is sufficient and
+ *     also necessary to allow validation of an L2 table mapping itself.
+ */
+static int 
+get_linear_pagetable(
+    l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
+{
+    u32 x, y;
+    struct pfn_info *page;
+
+    if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
+    {
+        MEM_LOG("Attempt to create linear p.t. with write perms");
+        return 0;
+    }
+
+    if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
+    {
+        /* Make sure the mapped frame belongs to the correct domain. */
+        if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
+            return 0;
+
+        /*
+         * Make sure that the mapped frame is an already-validated L2 table. 
+         * If so, atomically increment the count (checking for overflow).
+         */
+        page = &frame_table[l2_pgentry_to_pagenr(l2e)];
+        y = page->u.inuse.type_info;
+        do {
+            x = y;
+            if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
+                 unlikely((x & (PGT_type_mask|PGT_validated)) != 
+                          (PGT_l2_page_table|PGT_validated)) )
+            {
+                put_page(page);
+                return 0;
+            }
+        }
+        while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
+    }
+
+    return 1;
+}
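+
+/*
+ * For illustration: a guest obtains a linear mapping of its own page
+ * tables by installing (via mmu_update) a read-only L2 entry that points
+ * back at the L2 table itself, e.g. an entry value of
+ *
+ *     (l2_mfn << PAGE_SHIFT) | _PAGE_PRESENT
+ *
+ * with _PAGE_RW clear ('l2_mfn' is illustrative). Rule 1 above rejects
+ * the writable variant, which would give the guest direct write access
+ * to its own PTEs.
+ */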
+
+
+static int
+get_page_from_l1e(
+    l1_pgentry_t l1e, struct domain *d)
+{
+    unsigned long l1v = l1_pgentry_val(l1e);
+    unsigned long pfn = l1_pgentry_to_pagenr(l1e);
+    struct pfn_info *page = &frame_table[pfn];
+    extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
+
+    if ( !(l1v & _PAGE_PRESENT) )
+        return 1;
+
+    if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
+    {
+        MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
+        return 0;
+    }
+
+    if ( unlikely(!pfn_is_ram(pfn)) )
+    {
+        /* Revert to caller privileges if FD == DOMID_IO. */
+        if ( d == dom_io )
+            d = current->domain;
+
+        if ( IS_PRIV(d) )
+            return 1;
+
+        if ( IS_CAPABLE_PHYSDEV(d) )
+            return domain_iomem_in_pfn(d, pfn);
+
+        MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
+        return 0;
+    }
+
+    return ((l1v & _PAGE_RW) ?
+            get_page_and_type(page, d, PGT_writable_page) :
+            get_page(page, d));
+}
+
+
+/* NB. The L2 entry 'l2e' was read from the page table held in frame 'pfn'. */
+static int 
+get_page_from_l2e(
+    l2_pgentry_t l2e, unsigned long pfn,
+    struct domain *d, unsigned long va_idx)
+{
+    int rc;
+
+    if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
+        return 1;
+
+    if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
+    {
+        MEM_LOG("Bad L2 page type settings %04lx",
+                l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
+        return 0;
+    }
+
+    rc = get_page_and_type_from_pagenr(
+        l2_pgentry_to_pagenr(l2e), 
+        PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
+
+    if ( unlikely(!rc) )
+        return get_linear_pagetable(l2e, pfn, d);
+
+    return 1;
+}
+
+
+static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
+{
+    unsigned long    l1v  = l1_pgentry_val(l1e);
+    unsigned long    pfn  = l1_pgentry_to_pagenr(l1e);
+    struct pfn_info *page = &frame_table[pfn];
+    struct domain   *e;
+
+    if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
+        return;
+
+    e = page_get_owner(page);
+    if ( unlikely(e != d) )
+    {
+        /*
+         * Unmap a foreign page that may have been mapped via a grant table.
+         * Note that this can fail for a privileged domain that can map foreign
+         * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
+         * counted via a grant entry and some counted directly in the page
+         * structure's reference count. Note that reference counts won't get
+         * dangerously confused as long as we always try to decrement the
+         * grant entry first. We may end up with a mismatch between which
+         * mappings and which unmappings are counted via the grant entry, but
+         * really it doesn't matter as privileged domains have carte blanche.
+         */
+        if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
+            return;
+        /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
+    }
+
+    if ( l1v & _PAGE_RW )
+    {
+        put_page_and_type(page);
+    }
+    else
+    {
+        /* We expect this to be rare, so we blow away the entire shadow LDT. */
+        if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == 
+                       PGT_ldt_page)) &&
+             unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
+            invalidate_shadow_ldt(e->exec_domain[0]);
+        put_page(page);
+    }
+}
+
+
+/*
+ * NB. The L2 entry passed in resides in an L2 table occupying frame 'pfn'.
+ * Note also that this automatically deals correctly with linear p.t.'s.
+ */
+static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
+{
+    if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 
+         ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
+        put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
+}
+
+
+static int alloc_l2_table(struct pfn_info *page)
+{
+    struct domain *d = page_get_owner(page);
+    unsigned long  page_nr = page_to_pfn(page);
+    l2_pgentry_t  *pl2e;
+    int            i;
+   
+    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
+
+    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+        if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
+            goto fail;
+
+#if defined(__i386__)
+    /* Now we add our private high mappings. */
+    memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
+           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+    pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+        mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+    pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
+        mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) | 
+                      __PAGE_HYPERVISOR);
+#endif
+
+    unmap_domain_mem(pl2e);
+    return 1;
+
+ fail:
+    while ( i-- > 0 )
+        put_page_from_l2e(pl2e[i], page_nr);
+
+    unmap_domain_mem(pl2e);
+    return 0;
+}
+
+
+static int alloc_l1_table(struct pfn_info *page)
+{
+    struct domain *d = page_get_owner(page);
+    unsigned long  page_nr = page_to_pfn(page);
+    l1_pgentry_t  *pl1e;
+    int            i;
+
+    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
+
+    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+        if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
+            goto fail;
+
+    unmap_domain_mem(pl1e);
+    return 1;
+
+ fail:
+    while ( i-- > 0 )
+        put_page_from_l1e(pl1e[i], d);
+
+    unmap_domain_mem(pl1e);
+    return 0;
+}
+
+
+static void free_l2_table(struct pfn_info *page)
+{
+    unsigned long page_nr = page - frame_table;
+    l2_pgentry_t *pl2e;
+    int i;
+
+    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
+
+    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+        put_page_from_l2e(pl2e[i], page_nr);
+
+    unmap_domain_mem(pl2e);
+}
+
+
+static void free_l1_table(struct pfn_info *page)
+{
+    struct domain *d = page_get_owner(page);
+    unsigned long page_nr = page - frame_table;
+    l1_pgentry_t *pl1e;
+    int i;
+
+    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
+
+    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+        put_page_from_l1e(pl1e[i], d);
+
+    unmap_domain_mem(pl1e);
+}
+
+
+static inline int update_l2e(l2_pgentry_t *pl2e, 
+                             l2_pgentry_t  ol2e, 
+                             l2_pgentry_t  nl2e)
+{
+    unsigned long o = cmpxchg((unsigned long *)pl2e, 
+                              l2_pgentry_val(ol2e), 
+                              l2_pgentry_val(nl2e));
+    if ( o != l2_pgentry_val(ol2e) )
+        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
+                l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
+    return (o == l2_pgentry_val(ol2e));
+}
+
+
+/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
+static int mod_l2_entry(l2_pgentry_t *pl2e, 
+                        l2_pgentry_t nl2e, 
+                        unsigned long pfn)
+{
+    l2_pgentry_t ol2e;
+    unsigned long _ol2e;
+
+    if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
+                  DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
+    {
+        MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
+        return 0;
+    }
+
+    if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
+        return 0;
+    ol2e = mk_l2_pgentry(_ol2e);
+
+    if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
+    {
+        /* Differ in mapping (bits 12-31) or presence (bit 0)? */
+        if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
+            return update_l2e(pl2e, ol2e, nl2e);
+
+        if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
+                                        ((unsigned long)pl2e & 
+                                         ~PAGE_MASK) >> 2)) )
+            return 0;
+
+        if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
+        {
+            put_page_from_l2e(nl2e, pfn);
+            return 0;
+        }
+        
+        put_page_from_l2e(ol2e, pfn);
+        return 1;
+    }
+
+    if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
+        return 0;
+
+    put_page_from_l2e(ol2e, pfn);
+    return 1;
+}
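+
+/*
+ * Example of the fast path above: setting or clearing _PAGE_ACCESSED or
+ * _PAGE_DIRTY on a present entry changes neither bits 12-31 nor bit 0, so
+ * the new value goes in with a bare cmpxchg and no reference counts move.
+ * Only a change of target frame or of the present bit forces the full
+ * get/put cycle. mod_l1_entry() below has the analogous fast path, except
+ * that a r/w change is also slow there, since writable L1 mappings carry a
+ * type reference.
+ */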
+
+
+static inline int update_l1e(l1_pgentry_t *pl1e, 
+                             l1_pgentry_t  ol1e, 
+                             l1_pgentry_t  nl1e)
+{
+    unsigned long o = l1_pgentry_val(ol1e);
+    unsigned long n = l1_pgentry_val(nl1e);
+
+    if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
+         unlikely(o != l1_pgentry_val(ol1e)) )
+    {
+        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
+                l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
+        return 0;
+    }
+
+    return 1;
+}
+
+
+/* Update the L1 entry at pl1e to new value nl1e. */
+static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
+{
+    l1_pgentry_t ol1e;
+    unsigned long _ol1e;
+    struct domain *d = current->domain;
+
+    if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
+    {
+        MEM_LOG("Bad get_user\n");
+        return 0;
+    }
+    
+    ol1e = mk_l1_pgentry(_ol1e);
+
+    if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
+    {
+        /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
+        if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
+            return update_l1e(pl1e, ol1e, nl1e);
+
+        if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
+            return 0;
+        
+        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+        {
+            put_page_from_l1e(nl1e, d);
+            return 0;
+        }
+        
+        put_page_from_l1e(ol1e, d);
+        return 1;
+    }
+
+    if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+        return 0;
+    
+    put_page_from_l1e(ol1e, d);
+    return 1;
+}
+
+
+int alloc_page_type(struct pfn_info *page, unsigned int type)
+{
+    switch ( type )
+    {
+    case PGT_l1_page_table:
+        return alloc_l1_table(page);
+    case PGT_l2_page_table:
+        return alloc_l2_table(page);
+    case PGT_gdt_page:
+    case PGT_ldt_page:
+        return alloc_segdesc_page(page);
+    default:
+        printk("Bad type in alloc_page_type %x t=%x c=%x\n", 
+               type, page->u.inuse.type_info,
+               page->count_info);
+        BUG();
+    }
+
+    return 0;
+}
+
+
+void free_page_type(struct pfn_info *page, unsigned int type)
+{
+    struct domain *d = page_get_owner(page);
+
+    switch ( type )
+    {
+    case PGT_l1_page_table:
+        free_l1_table(page);
+        break;
+
+    case PGT_l2_page_table:
+        free_l2_table(page);
+        break;
+
+    default:
+        BUG();
+    }
+
+    if ( unlikely(d->arch.shadow_mode) && 
+         (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) )
+    {
+        unshadow_table(page_to_pfn(page), type);
+        put_shadow_status(d);
+    }
+}
+
+
+void put_page_type(struct pfn_info *page)
+{
+    u32 nx, x, y = page->u.inuse.type_info;
+
+ again:
+    do {
+        x  = y;
+        nx = x - 1;
+
+        ASSERT((x & PGT_count_mask) != 0);
+
+        /*
+         * The page should always be validated while a reference is held. The 
+         * exception is during domain destruction, when we forcibly invalidate 
+         * page-table pages if we detect a referential loop.
+         * See domain.c:relinquish_list().
+         */
+        ASSERT((x & PGT_validated) || 
+               test_bit(DF_DYING, &page_get_owner(page)->d_flags));
+
+        if ( unlikely((nx & PGT_count_mask) == 0) )
+        {
+            /* Record TLB information for flush later. Races are harmless. */
+            page->tlbflush_timestamp = tlbflush_current_time();
+            
+            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
+                 likely(nx & PGT_validated) )
+            {
+                /*
+                 * Page-table pages must be unvalidated when count is zero.
+                 * The 'free' is safe because we still hold the type
+                 * reference (count is non-zero) while the validated bit is
+                 * clear, so other ops will spin or fail.
+                 */
+                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
+                                           x & ~PGT_validated)) != x) )
+                    goto again;
+                /* We cleared the 'valid bit' so we must do the cleanup. */
+                free_page_type(page, x & PGT_type_mask);
+                /* Carry on, but with the 'valid bit' now clear. */
+                x  &= ~PGT_validated;
+                nx &= ~PGT_validated;
+            }
+        }
+        else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 
+                           (PGT_pinned | 1)) )
+        {
+            /* Page is now only pinned. Make the back pointer mutable again. */
+            nx |= PGT_va_mutable;
+        }
+    }
+    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
+}
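+
+/*
+ * Sketch of the final-reference transition above for a validated page-table
+ * page whose type count drops from 1 to 0:
+ *
+ *     (PGT_l1_page_table | PGT_validated, count == 1)
+ *        --[cmpxchg clears PGT_validated]--> free_page_type()
+ *        --[outer cmpxchg]--> (count == 0, unvalidated)
+ *
+ * Any get_page_type() that observes the intermediate state (count != 0,
+ * PGT_validated clear) spins or fails rather than reusing the page early.
+ */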
+
+
+int get_page_type(struct pfn_info *page, u32 type)
+{
+    u32 nx, x, y = page->u.inuse.type_info;
+
+ again:
+    do {
+        x  = y;
+        nx = x + 1;
+        if ( unlikely((nx & PGT_count_mask) == 0) )
+        {
+            MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
+            return 0;
+        }
+        else if ( unlikely((x & PGT_count_mask) == 0) )
+        {
+            if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
+            {
+                /*
+                 * On a type change we check whether any stale TLB entries
+                 * need flushing. This may be unnecessary (e.g., the page
+                 * was a GDT/LDT) but such circumstances should be very rare.
+                 */
+                struct domain *d = page_get_owner(page);
+                if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
+                                         page->tlbflush_timestamp)) )
+                {
+                    perfc_incr(need_flush_tlb_flush);
+                    flush_tlb_cpu(d->exec_domain[0]->processor);
+                }
+
+                /* We lose existing type, back pointer, and validity. */
+                nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
+                nx |= type;
+
+                /* No special validation needed for writable pages. */
+                /* Page tables and GDT/LDT need to be scanned for validity. */
+                if ( type == PGT_writable_page )
+                    nx |= PGT_validated;
+            }
+        }
+        else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
+        {
+            if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
+            {
+                if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
+                     ((type & PGT_type_mask) != PGT_l1_page_table) )
+                    MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
+                            x & PGT_type_mask, type, page_to_pfn(page));
+                return 0;
+            }
+            else if ( (x & PGT_va_mask) == PGT_va_mutable )
+            {
+                /* The va backpointer is mutable, hence we update it. */
+                nx &= ~PGT_va_mask;
+                nx |= type; /* we know the actual type is correct */
+            }
+            else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
+            {
+                /* This table is potentially mapped at multiple locations. */
+                nx &= ~PGT_va_mask;
+                nx |= PGT_va_unknown;
+            }
+        }
+        else if ( unlikely(!(x & PGT_validated)) )
+        {
+            /* Someone else is updating validation of this page. Wait... */
+            while ( (y = page->u.inuse.type_info) == x )
+            {
+                rep_nop();
+                barrier();
+            }
+            goto again;
+        }
+    }
+    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
+
+    if ( unlikely(!(nx & PGT_validated)) )
+    {
+        /* Try to validate page type; drop the new reference on failure. */
+        if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
+        {
+            MEM_LOG("Error while validating pfn %08lx for type %08x."
+                    " caf=%08x taf=%08x\n",
+                    page_to_pfn(page), type,
+                    page->count_info,
+                    page->u.inuse.type_info);
+            /* No one else can get a reference: we hold the only ref. */
+            page->u.inuse.type_info = 0;
+            return 0;
+        }
+
+        /* No one else is updating simultaneously. */
+        __set_bit(_PGT_validated, &page->u.inuse.type_info);
+    }
+
+    return 1;
+}
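+
+/*
+ * Worked example of the va backpointer logic above: an L1 table first
+ * referenced from L2 slot 5 takes type PGT_l1_page_table|(5<<PGT_va_shift).
+ * A later reference from slot 9 hits the PGT_va_mask mismatch case and
+ * demotes the backpointer to PGT_va_unknown ("mapped in more than one
+ * place"), after which a writable-pagetable fault on this L1 cannot name a
+ * unique L2 slot and ptwr_do_page_fault() crashes the domain.
+ */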
+
+
+int new_guest_cr3(unsigned long pfn)
+{
+    struct exec_domain *ed = current;
+    struct domain *d = ed->domain;
+    int okay, cpu = smp_processor_id();
+    unsigned long old_base_pfn;
+    
+    okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
+    if ( likely(okay) )
+    {
+        invalidate_shadow_ldt(ed);
+
+        percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
+        old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT;
+        ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
+
+        shadow_mk_pagetable(ed);
+
+        write_ptbase(ed);
+
+        put_page_and_type(&frame_table[old_base_pfn]);
+    }
+    else
+    {
+        MEM_LOG("Error while installing new baseptr %08lx", pfn);
+    }
+
+    return okay;
+}
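+
+/*
+ * NB. write_ptbase() above reloads the page-table base register; since
+ * guest mappings are never global (_PAGE_GLOBAL is rejected in
+ * get_page_from_l1e/l2e), this implicitly flushes them from this CPU's
+ * TLB, which is why DOP_FLUSH_TLB is cleared above rather than left to
+ * fire redundantly later.
+ */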
+
+static int do_extended_command(unsigned long ptr, unsigned long val)
+{
+    int okay = 1, cpu = smp_processor_id();
+    unsigned int cmd = val & MMUEXT_CMD_MASK;
+    unsigned long pfn = ptr >> PAGE_SHIFT;
+    struct pfn_info *page = &frame_table[pfn];
+    struct exec_domain *ed = current;
+    struct domain *d = ed->domain, *nd, *e;
+    u32 x, y;
+    domid_t domid;
+    grant_ref_t gntref;
+
+    switch ( cmd )
+    {
+    case MMUEXT_PIN_L1_TABLE:
+    case MMUEXT_PIN_L2_TABLE:
+        /*
+         * We insist that, if you pin an L1 page, it's the first thing that
+         * you do to it. This is because we require the backptr to still be
+         * mutable. This assumption seems safe.
+         */
+        okay = get_page_and_type_from_pagenr(
+            pfn, 
+            ((cmd==MMUEXT_PIN_L2_TABLE) ? 
+             PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
+            FOREIGNDOM);
+
+        if ( unlikely(!okay) )
+        {
+            MEM_LOG("Error while pinning pfn %08lx", pfn);
+            break;
+        }
+
+        if ( unlikely(test_and_set_bit(_PGT_pinned,
+                                       &page->u.inuse.type_info)) )
+        {
+            MEM_LOG("Pfn %08lx already pinned", pfn);
+            put_page_and_type(page);
+            okay = 0;
+            break;
+        }
+
+        break;
+
+    case MMUEXT_UNPIN_TABLE:
+        if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
+        {
+            MEM_LOG("Page %08lx bad domain (dom=%p)",
+                    ptr, page_get_owner(page));
+        }
+        else if ( likely(test_and_clear_bit(_PGT_pinned, 
+                                            &page->u.inuse.type_info)) )
+        {
+            put_page_and_type(page);
+            put_page(page);
+        }
+        else
+        {
+            okay = 0;
+            put_page(page);
+            MEM_LOG("Pfn %08lx not pinned", pfn);
+        }
+        break;
+
+    case MMUEXT_NEW_BASEPTR:
+        okay = new_guest_cr3(pfn);
+        break;
+        
+    case MMUEXT_TLB_FLUSH:
+        percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
+        break;
+    
+    case MMUEXT_INVLPG:
+        __flush_tlb_one(ptr);
+        break;
+
+    case MMUEXT_FLUSH_CACHE:
+        if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
+        {
+            MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
+            okay = 0;
+        }
+        else
+        {
+            wbinvd();
+        }
+        break;
+
+    case MMUEXT_SET_LDT:
+    {
+        unsigned long ents = val >> MMUEXT_CMD_SHIFT;
+        if ( ((ptr & (PAGE_SIZE-1)) != 0) || 
+             (ents > 8192) ||
+             ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
+             ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
+        {
+            okay = 0;
+            MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
+        }
+        else if ( (ed->arch.ldt_ents != ents) || 
+                  (ed->arch.ldt_base != ptr) )
+        {
+            invalidate_shadow_ldt(ed);
+            ed->arch.ldt_base = ptr;
+            ed->arch.ldt_ents = ents;
+            load_LDT(ed);
+            percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
+            if ( ents != 0 )
+                percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
+        }
+        break;
+    }
+
+    case MMUEXT_SET_FOREIGNDOM:
+        domid = (domid_t)(val >> 16);
+
+        if ( (e = percpu_info[cpu].foreign) != NULL )
+            put_domain(e);
+        percpu_info[cpu].foreign = NULL;
+
+        if ( !IS_PRIV(d) )
+        {
+            switch ( domid )
+            {
+            case DOMID_IO:
+                get_knownalive_domain(dom_io);
+                percpu_info[cpu].foreign = dom_io;
+                break;
+            default:
+                MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
+                okay = 0;
+                break;
+            }
+        }
+        else
+        {
+            percpu_info[cpu].foreign = e = find_domain_by_id(domid);
+            if ( e == NULL )
+            {
+                switch ( domid )
+                {
+                case DOMID_XEN:
+                    get_knownalive_domain(dom_xen);
+                    percpu_info[cpu].foreign = dom_xen;
+                    break;
+                case DOMID_IO:
+                    get_knownalive_domain(dom_io);
+                    percpu_info[cpu].foreign = dom_io;
+                    break;
+                default:
+                    MEM_LOG("Unknown domain '%u'", domid);
+                    okay = 0;
+                    break;
+                }
+            }
+        }
+        break;
+
+    case MMUEXT_TRANSFER_PAGE:
+        domid  = (domid_t)(val >> 16);
+        gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
+        
+        if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
+             unlikely(!pfn_is_ram(pfn)) ||
+             unlikely((e = find_domain_by_id(domid)) == NULL) )
+        {
+            MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
+            okay = 0;
+            break;
+        }
+
+        spin_lock(&d->page_alloc_lock);
+
+        /*
+         * The tricky bit: atomically release ownership while there is just one
+         * benign reference to the page (PGC_allocated). If that reference
+         * disappears then the deallocation routine will safely spin.
+         */
+        nd = page_get_owner(page);
+        y  = page->count_info;
+        do {
+            x = y;
+            if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 
+                          (1|PGC_allocated)) ||
+                 unlikely(nd != d) )
+            {
+                MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
+                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
+                        d, d->id, nd, x, page->u.inuse.type_info);
+                spin_unlock(&d->page_alloc_lock);
+                put_domain(e);
+                return 0;
+            }
+            __asm__ __volatile__(
+                LOCK_PREFIX "cmpxchg8b %2"
+                : "=d" (nd), "=a" (y),
+                "=m" (*(volatile u64 *)(&page->count_info))
+                : "0" (d), "1" (x), "c" (NULL), "b" (x) );
+        } 
+        while ( unlikely(nd != d) || unlikely(y != x) );
+
+        /*
+         * Unlink from 'd'. At least one reference remains (now anonymous), so
+         * no one else is spinning to try to delete this page from 'd'.
+         */
+        d->tot_pages--;
+        list_del(&page->list);
+        
+        spin_unlock(&d->page_alloc_lock);
+
+        spin_lock(&e->page_alloc_lock);
+
+        /*
+         * Check that 'e' will accept the page and has reservation headroom.
+         * Also, a domain mustn't have PGC_allocated pages when it is dying.
+         */
+        ASSERT(e->tot_pages <= e->max_pages);
+        if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
+             unlikely(e->tot_pages == e->max_pages) ||
+             unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
+        {
+            MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
+                    "provided a bad grant ref, or is dying (%08lx).\n",
+                    e->tot_pages, e->max_pages, e->d_flags);
+            spin_unlock(&e->page_alloc_lock);
+            put_domain(e);
+            okay = 0;
+            break;
+        }
+
+        /* Okay, add the page to 'e'. */
+        if ( unlikely(e->tot_pages++ == 0) )
+            get_knownalive_domain(e);
+        list_add_tail(&page->list, &e->page_list);
+        page_set_owner(page, e);
+
+        spin_unlock(&e->page_alloc_lock);
+
+        /* Transfer is all done: tell the guest about its new page frame. */
+        gnttab_notify_transfer(e, gntref, pfn);
+        
+        put_domain(e);
+        break;
+
+    case MMUEXT_REASSIGN_PAGE:
+        if ( unlikely(!IS_PRIV(d)) )
+        {
+            MEM_LOG("Dom %u has no reassignment priv", d->id);
+            okay = 0;
+            break;
+        }
+
+        e = percpu_info[cpu].foreign;
+        if ( unlikely(e == NULL) )
+        {
+            MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
+            okay = 0;
+            break;
+        }
+
+        /*
+         * Grab both page_list locks, in order. This prevents the page from
+         * disappearing elsewhere while we modify the owner, and we'll need
+         * both locks if we're successful so that we can change lists.
+         */
+        if ( d < e )
+        {
+            spin_lock(&d->page_alloc_lock);
+            spin_lock(&e->page_alloc_lock);
+        }
+        else
+        {
+            spin_lock(&e->page_alloc_lock);
+            spin_lock(&d->page_alloc_lock);
+        }
+
+        /* A domain shouldn't have PGC_allocated pages when it is dying. */
+        if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
+             unlikely(IS_XEN_HEAP_FRAME(page)) )
+        {
+            MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
+            okay = 0;
+            goto reassign_fail;
+        }
+
+        /*
+         * The tricky bit: atomically change owner while there is just one
+         * benign reference to the page (PGC_allocated). If that reference
+         * disappears then the deallocation routine will safely spin.
+         */
+        nd = page_get_owner(page);
+        y  = page->count_info;
+        do {
+            x = y;
+            if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 
+                          (1|PGC_allocated)) ||
+                 unlikely(nd != d) )
+            {
+                MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
+                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
+                        d, d->id, nd, x, page->u.inuse.type_info);
+                okay = 0;
+                goto reassign_fail;
+            }
+            __asm__ __volatile__(
+                LOCK_PREFIX "cmpxchg8b %3"
+                : "=d" (nd), "=a" (y), "=c" (e),
+                "=m" (*(volatile u64 *)(&page->count_info))
+                : "0" (d), "1" (x), "c" (e), "b" (x) );
+        } 
+        while ( unlikely(nd != d) || unlikely(y != x) );
+        
+        /*
+         * Unlink from 'd'. We transferred at least one reference to 'e', so
+         * no one else is spinning to try to delete this page from 'd'.
+         */
+        d->tot_pages--;
+        list_del(&page->list);
+        
+        /*
+         * Add the page to 'e'. Someone may already have removed the last
+         * reference and want to remove the page from 'e'. However, we have
+         * the lock so they'll spin waiting for us.
+         */
+        if ( unlikely(e->tot_pages++ == 0) )
+            get_knownalive_domain(e);
+        list_add_tail(&page->list, &e->page_list);
+
+    reassign_fail:        
+        spin_unlock(&d->page_alloc_lock);
+        spin_unlock(&e->page_alloc_lock);
+        break;
+
+    case MMUEXT_CLEAR_FOREIGNDOM:
+        if ( (e = percpu_info[cpu].foreign) != NULL )
+            put_domain(e);
+        percpu_info[cpu].foreign = NULL;
+        break;
+
+    default:
+        MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
+        okay = 0;
+        break;
+    }
+
+    return okay;
+}
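+
+/*
+ * Guest-side encoding sketch (under the definitions in the public
+ * headers): an extended command travels in an ordinary mmu_update_t, with
+ * the subcommand in the low bits of 'val' and any address argument in
+ * 'ptr'. E.g., to switch to a new page-table base in machine frame 'mfn'
+ * a guest would queue, roughly:
+ *
+ *     req.ptr = (mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
+ *     req.val = MMUEXT_NEW_BASEPTR;
+ */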
+
+int do_mmu_update(
+    mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
+{
+/*
+ * We steal the m.s.b. of the @count parameter to indicate whether this
+ * invocation of do_mmu_update() is resuming a previously preempted call.
+ * We steal the next 15 bits to remember the current FOREIGNDOM.
+ */
+#define MMU_UPDATE_PREEMPTED          (~(~0U>>1))
+#define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
+#define MMU_UPDATE_PREEMPT_FDOM_MASK  (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
+
+    mmu_update_t req;
+    unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
+    struct pfn_info *page;
+    int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
+    unsigned int cmd, done = 0;
+    unsigned long prev_spfn = 0;
+    l1_pgentry_t *prev_spl1e = 0;
+    struct exec_domain *ed = current;
+    struct domain *d = ed->domain;
+    u32 type_info;
+    domid_t domid;
+
+    LOCK_BIGLOCK(d);
+
+    cleanup_writable_pagetable(d);
+
+    /*
+     * If we are resuming after preemption, read how much work we have already
+     * done. This allows us to set the @done output parameter correctly.
+     * We also restore the saved FOREIGNDOM here.
+     */
+    if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
+    {
+        if ( !(count & MMU_UPDATE_PREEMPTED) )
+        {
+            /* Count overflow into private FOREIGNDOM field. */
+            MEM_LOG("do_mmu_update count is too large");
+            rc = -EINVAL;
+            goto out;
+        }
+        count &= ~MMU_UPDATE_PREEMPTED;
+        domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
+        count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
+        if ( unlikely(pdone != NULL) )
+            (void)get_user(done, pdone);
+        if ( (domid != current->domain->id) &&
+             !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
+        {
+            rc = -EINVAL;
+            goto out;
+        }
+    }
+
+    perfc_incrc(calls_to_mmu_update); 
+    perfc_addc(num_page_updates, count);
+
+    if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
+    {
+        rc = -EFAULT;
+        goto out;
+    }
+
+    for ( i = 0; i < count; i++ )
+    {
+        if ( hypercall_preempt_check() )
+        {
+            rc = hypercall3_create_continuation(
+                __HYPERVISOR_mmu_update, ureqs, 
+                (count - i) |
+                (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | 
+                MMU_UPDATE_PREEMPTED, pdone);
+            break;
+        }
+
+        if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
+        {
+            MEM_LOG("Bad __copy_from_user");
+            rc = -EFAULT;
+            break;
+        }
+
+        cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
+        pfn = req.ptr >> PAGE_SHIFT;
+
+        okay = 0;
+
+        switch ( cmd )
+        {
+            /*
+             * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
+             */
+        case MMU_NORMAL_PT_UPDATE:
+            if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
+            {
+                MEM_LOG("Could not get page for normal update");
+                break;
+            }
+
+            if ( likely(prev_pfn == pfn) )
+            {
+                va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
+            }
+            else
+            {
+                if ( prev_pfn != 0 )
+                    unmap_domain_mem((void *)va);
+                va = (unsigned long)map_domain_mem(req.ptr);
+                prev_pfn = pfn;
+            }
+
+            page = &frame_table[pfn];
+            switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
+            {
+            case PGT_l1_page_table: 
+                if ( likely(get_page_type(
+                    page, type_info & (PGT_type_mask|PGT_va_mask))) )
+                {
+                    okay = mod_l1_entry((l1_pgentry_t *)va, 
+                                        mk_l1_pgentry(req.val)); 
+
+                    if ( unlikely(d->arch.shadow_mode) && okay &&
+                         (get_shadow_status(d, page-frame_table) &
+                          PSH_shadowed) )
+                    {
+                        shadow_l1_normal_pt_update(
+                            req.ptr, req.val, &prev_spfn, &prev_spl1e);
+                        put_shadow_status(d);
+                    }
+
+                    put_page_type(page);
+                }
+                break;
+            case PGT_l2_page_table:
+                if ( likely(get_page_type(page, PGT_l2_page_table)) )
+                {
+                    okay = mod_l2_entry((l2_pgentry_t *)va, 
+                                        mk_l2_pgentry(req.val),
+                                        pfn); 
+
+                    if ( unlikely(d->arch.shadow_mode) && okay &&
+                         (get_shadow_status(d, page-frame_table) & 
+                          PSH_shadowed) )
+                    {
+                        shadow_l2_normal_pt_update(req.ptr, req.val);
+                        put_shadow_status(d);
+                    }
+
+                    put_page_type(page);
+                }
+                break;
+            default:
+                if ( likely(get_page_type(page, PGT_writable_page)) )
+                {
+                    *(unsigned long *)va = req.val;
+                    okay = 1;
+                    put_page_type(page);
+                }
+                break;
+            }
+
+            put_page(page);
+            break;
+
+        case MMU_MACHPHYS_UPDATE:
+            if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
+            {
+                MEM_LOG("Could not get page for mach->phys update");
+                break;
+            }
+
+            machine_to_phys_mapping[pfn] = req.val;
+            okay = 1;
+
+            /*
+             * If in log-dirty mode, mark the corresponding pseudo-physical
+             * page as dirty.
+             */
+            if ( unlikely(d->arch.shadow_mode == SHM_logdirty) && 
+                 mark_dirty(d, pfn) )
+                d->arch.shadow_dirty_block_count++;
+
+            put_page(&frame_table[pfn]);
+            break;
+
+            /*
+             * MMU_EXTENDED_COMMAND: Extended command is specified
+             * in the least-significant bits of the 'value' field.
+             */
+        case MMU_EXTENDED_COMMAND:
+            req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
+            okay = do_extended_command(req.ptr, req.val);
+            break;
+
+        default:
+            MEM_LOG("Invalid page update command %08lx", req.ptr);
+            break;
+        }
+
+        if ( unlikely(!okay) )
+        {
+            rc = -EINVAL;
+            break;
+        }
+
+        ureqs++;
+    }
+
+ out:
+    if ( prev_pfn != 0 )
+        unmap_domain_mem((void *)va);
+
+    if ( unlikely(prev_spl1e != 0) ) 
+        unmap_domain_mem((void *)prev_spl1e);
+
+    deferred_ops = percpu_info[cpu].deferred_ops;
+    percpu_info[cpu].deferred_ops = 0;
+
+    if ( deferred_ops & DOP_FLUSH_TLB )
+        local_flush_tlb();
+        
+    if ( deferred_ops & DOP_RELOAD_LDT )
+        (void)map_ldt_shadow_page(0);
+
+    if ( unlikely(percpu_info[cpu].foreign != NULL) )
+    {
+        put_domain(percpu_info[cpu].foreign);
+        percpu_info[cpu].foreign = NULL;
+    }
+
+    /* Add incremental work we have done to the @done output parameter. */
+    if ( unlikely(pdone != NULL) )
+        __put_user(done + i, pdone);
+
+    UNLOCK_BIGLOCK(d);
+    return rc;
+}
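+
+/*
+ * Minimal guest-side batching sketch, assuming the usual public wrapper
+ * HYPERVISOR_mmu_update(reqs, count, done), and with 'pte_ma' (machine
+ * address of a PTE), 'new_pte', 'mfn' and 'pfn' supplied by the guest:
+ *
+ *     mmu_update_t u[2];
+ *     u[0].ptr = pte_ma;               u[0].val = new_pte;
+ *     u[1].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+ *     u[1].val = pfn;
+ *     (void)HYPERVISOR_mmu_update(u, 2, NULL);
+ *
+ * If the hypercall is preempted, the continuation re-enters with the
+ * PREEMPTED flag and the saved FOREIGNDOM folded into 'count', exactly as
+ * decoded at the top of this function.
+ */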
+
+
+int do_update_va_mapping(unsigned long va,
+                         unsigned long val, 
+                         unsigned long flags)
+{
+    struct exec_domain *ed = current;
+    struct domain *d = ed->domain;
+    int err = 0;
+    unsigned int cpu = ed->processor;
+    unsigned long deferred_ops;
+
+    perfc_incrc(calls_to_update_va);
+
+    if ( unlikely(!__addr_ok(va)) )
+        return -EINVAL;
+
+    LOCK_BIGLOCK(d);
+
+    cleanup_writable_pagetable(d);
+
+    /*
+     * XXX When we make this support 4MB superpages we should also deal with 
+     * the case of updating L2 entries.
+     */
+
+    if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
+                                mk_l1_pgentry(val))) )
+        err = -EINVAL;
+
+    if ( unlikely(d->arch.shadow_mode) )
+    {
+        unsigned long sval;
+
+        l1pte_propagate_from_guest(d, &val, &sval);
+
+        if ( unlikely(__put_user(sval, ((unsigned long *)(
+            &shadow_linear_pg_table[l1_linear_offset(va)])))) )
+        {
+            /*
+             * Since L2 entries are guaranteed RW, a fault here indicates
+             * that the page is not shadowed, so ignore it.
+             */
+            perfc_incrc(shadow_update_va_fail);
+        }
+
+        /*
+         * If we're in log-dirty mode then we need to note that we've updated
+         * the PTE in the PT-holding page. We need the machine frame number
+         * for this.
+         */
+        if ( d->arch.shadow_mode == SHM_logdirty )
+            mark_dirty(d, va_to_l1mfn(va));
+  
+        check_pagetable(d, ed->arch.pagetable, "va"); /* debug */
+    }
+
+    deferred_ops = percpu_info[cpu].deferred_ops;
+    percpu_info[cpu].deferred_ops = 0;
+
+    if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || 
+         unlikely(flags & UVMF_FLUSH_TLB) )
+        local_flush_tlb();
+    else if ( unlikely(flags & UVMF_INVLPG) )
+        __flush_tlb_one(va);
+
+    if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
+        (void)map_ldt_shadow_page(0);
+    
+    UNLOCK_BIGLOCK(d);
+
+    return err;
+}
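+
+/*
+ * Flag semantics as implemented above: UVMF_FLUSH_TLB forces a full local
+ * TLB flush, UVMF_INVLPG invalidates just the one VA, and a pending
+ * DOP_FLUSH_TLB deferred op upgrades either case to the full flush. A
+ * guest replacing a single PTE would typically pass UVMF_INVLPG.
+ */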
+
+int do_update_va_mapping_otherdomain(unsigned long va,
+                                     unsigned long val, 
+                                     unsigned long flags,
+                                     domid_t domid)
+{
+    unsigned int cpu = smp_processor_id();
+    struct domain *d;
+    int rc;
+
+    if ( unlikely(!IS_PRIV(current->domain)) )
+        return -EPERM;
+
+    percpu_info[cpu].foreign = d = find_domain_by_id(domid);
+    if ( unlikely(d == NULL) )
+    {
+        MEM_LOG("Unknown domain '%u'", domid);
+        return -ESRCH;
+    }
+
+    rc = do_update_va_mapping(va, val, flags);
+
+    put_domain(d);
+    percpu_info[cpu].foreign = NULL;
+
+    return rc;
+}
+
+
+
+/*************************
+ * Descriptor Tables
+ */
+
+void destroy_gdt(struct exec_domain *ed)
+{
+    int i;
+    unsigned long pfn;
+
+    for ( i = 0; i < 16; i++ )
+    {
+        if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 )
+            put_page_and_type(&frame_table[pfn]);
+        ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
+    }
+}
+
+
+long set_gdt(struct exec_domain *ed, 
+             unsigned long *frames,
+             unsigned int entries)
+{
+    struct domain *d = ed->domain;
+    /* NB. There are 512 8-byte entries per GDT page. */
+    int i = 0, nr_pages = (entries + 511) / 512;
+    struct desc_struct *vgdt;
+    unsigned long pfn;
+
+    /* Check the first page in the new GDT. */
+    if ( (pfn = frames[0]) >= max_page )
+        goto fail;
+
+    /* The first page is special because Xen owns a range of entries in it. */
+    if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
+    {
+        /* GDT checks failed: try zapping the Xen reserved entries. */
+        if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) )
+            goto fail;
+        vgdt = map_domain_mem(pfn << PAGE_SHIFT);
+        memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0,
+               NR_RESERVED_GDT_ENTRIES*8);
+        unmap_domain_mem(vgdt);
+        put_page_and_type(&frame_table[pfn]);
+
+        /* Okay, we zapped the entries. Now try the GDT checks again. */
+        if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
+            goto fail;
+    }
+
+    /* Check the remaining pages in the new GDT. */
+    for ( i = 1; i < nr_pages; i++ )
+        if ( ((pfn = frames[i]) >= max_page) ||
+             !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
+            goto fail;
+
+    /* Copy reserved GDT entries to the new GDT. */
+    vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
+    memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 
+           gdt_table + FIRST_RESERVED_GDT_ENTRY, 
+           NR_RESERVED_GDT_ENTRIES*8);
+    unmap_domain_mem(vgdt);
+
+    /* Tear down the old GDT. */
+    destroy_gdt(ed);
+
+    /* Install the new GDT. */
+    for ( i = 0; i < nr_pages; i++ )
+        ed->arch.perdomain_ptes[i] =
+            mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+
+    SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed));
+    SET_GDT_ENTRIES(ed, entries);
+
+    return 0;
+
+ fail:
+    while ( i-- > 0 )
+        put_page_and_type(&frame_table[frames[i]]);
+    return -EINVAL;
+}
+
+
+long do_set_gdt(unsigned long *frame_list, unsigned int entries)
+{
+    int nr_pages = (entries + 511) / 512;
+    unsigned long frames[16];
+    long ret;
+
+    if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) ) 
+        return -EINVAL;
+    
+    if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
+        return -EFAULT;
+
+    LOCK_BIGLOCK(current->domain);
+
+    if ( (ret = set_gdt(current, frames, entries)) == 0 )
+    {
+        local_flush_tlb();
+        __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt));
+    }
+
+    UNLOCK_BIGLOCK(current->domain);
+
+    return ret;
+}
+
+
+long do_update_descriptor(
+    unsigned long pa, unsigned long word1, unsigned long word2)
+{
+    unsigned long pfn = pa >> PAGE_SHIFT;
+    struct desc_struct *gdt_pent, d;
+    struct pfn_info *page;
+    struct exec_domain *ed;
+    long ret = -EINVAL;
+
+    d.a = (u32)word1;
+    d.b = (u32)word2;
+
+    LOCK_BIGLOCK(current->domain);
+
+    if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) {
+        UNLOCK_BIGLOCK(current->domain);
+        return -EINVAL;
+    }
+
+    page = &frame_table[pfn];
+    if ( unlikely(!get_page(page, current->domain)) ) {
+        UNLOCK_BIGLOCK(current->domain);
+        return -EINVAL;
+    }
+
+    /* Check if the given frame is in use in an unsafe context. */
+    switch ( page->u.inuse.type_info & PGT_type_mask )
+    {
+    case PGT_gdt_page:
+        /* Disallow updates of Xen-reserved descriptors in the current GDT. */
+        for_each_exec_domain(current->domain, ed) {
+            if ( (l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[0]) == pfn) &&
+                 (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
+                 (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
+                goto out;
+        }
+        if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
+            goto out;
+        break;
+    case PGT_ldt_page:
+        if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
+            goto out;
+        break;
+    default:
+        if ( unlikely(!get_page_type(page, PGT_writable_page)) )
+            goto out;
+        break;
+    }
+
+    /* All is good so make the update. */
+    gdt_pent = map_domain_mem(pa);
+    memcpy(gdt_pent, &d, 8);
+    unmap_domain_mem(gdt_pent);
+
+    put_page_type(page);
+
+    ret = 0; /* success */
+
+ out:
+    put_page(page);
+
+    UNLOCK_BIGLOCK(current->domain);
+
+    return ret;
+}
+
+
+
+/*************************
+ * Writable Pagetables
+ */
+
+ptwr_info_t ptwr_info[NR_CPUS];
+
+#ifdef VERBOSE
+int ptwr_debug = 0x0;
+#define PTWR_PRINTK(_f, _a...) \
+ do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
+#define PTWR_PRINT_WHICH (which ? 'I' : 'A')
+#else
+#define PTWR_PRINTK(_f, _a...) ((void)0)
+#endif
+
+/* Flush the given writable p.t. page and write-protect it again. */
+void ptwr_flush(const int which)
+{
+    unsigned long  sstat, spte, pte, *ptep, l1va;
+    l1_pgentry_t  *sl1e = NULL, *pl1e, ol1e, nl1e;
+    l2_pgentry_t  *pl2e;
+    int            i, cpu = smp_processor_id();
+    struct exec_domain *ed = current;
+    struct domain *d = ed->domain;
+
+    l1va = ptwr_info[cpu].ptinfo[which].l1va;
+    ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
+
+    /*
+     * STEP 1. Write-protect the p.t. page so no more updates can occur.
+     */
+
+    if ( unlikely(__get_user(pte, ptep)) )
+    {
+        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
+        /*
+         * Really a bug. We could read this PTE during the initial fault,
+         * and the pagetables can't have changed in the meantime.
+         * XXX Multi-CPU guests?
+         */
+        BUG();
+    }
+    PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
+                PTWR_PRINT_WHICH, ptep, pte);
+    pte &= ~_PAGE_RW;
+
+    if ( unlikely(d->arch.shadow_mode) )
+    {
+        /* Write-protect the p.t. page in the shadow page table. */
+        l1pte_propagate_from_guest(d, &pte, &spte);
+        __put_user(
+            spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
+
+        /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
+        sstat = get_shadow_status(d, pte >> PAGE_SHIFT);
+        if ( sstat & PSH_shadowed )
+            sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
+    }
+
+    /* Write-protect the p.t. page in the guest page table. */
+    if ( unlikely(__put_user(pte, ptep)) )
+    {
+        MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
+        /*
+         * Really a bug. We could write this PTE during the initial fault,
+         * and the pagetables can't have changed in the meantime.
+         * XXX Multi-CPU guests?
+         */
+        BUG();
+    }
+
+    /* Ensure that there are no stale writable mappings in any TLB. */
+    /* NB. INVLPG is a serialising instruction: flushes pending updates. */
+#if 1
+    __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
+#else
+    flush_tlb_all();
+#endif
+    PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
+                PTWR_PRINT_WHICH, ptep, pte);
+
+    /*
+     * STEP 2. Validate any modified PTEs.
+     */
+
+    pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
+    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+    {
+        ol1e = ptwr_info[cpu].ptinfo[which].page[i];
+        nl1e = pl1e[i];
+
+        if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
+            continue;
+
+        /*
+         * Fast path for PTEs that have merely been write-protected
+         * (e.g., during a Unix fork()): a strict reduction in privilege,
+         * so the entry need not be fully revalidated.
+         */
+        if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
+        {
+            if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
+            {
+                if ( unlikely(sl1e != NULL) )
+                    l1pte_propagate_from_guest(
+                        d, &l1_pgentry_val(nl1e), 
+                        &l1_pgentry_val(sl1e[i]));
+                put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
+            }
+            continue;
+        }
+
+        if ( unlikely(!get_page_from_l1e(nl1e, d)) )
+        {
+            MEM_LOG("ptwr: Could not re-validate l1 page\n");
+            /*
+             * Make the remaining p.t. entries consistent before crashing,
+             * so that the reference counts remain correct.
+             */
+            memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
+                   (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
+            unmap_domain_mem(pl1e);
+            ptwr_info[cpu].ptinfo[which].l1va = 0;
+            UNLOCK_BIGLOCK(d);
+            domain_crash();
+        }
+        
+        if ( unlikely(sl1e != NULL) )
+            l1pte_propagate_from_guest(
+                d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
+
+        if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
+            put_page_from_l1e(ol1e, d);
+    }
+    unmap_domain_mem(pl1e);
+
+    /*
+     * STEP 3. Reattach the L1 p.t. page into the current address space.
+     */
+
+    if ( (which == PTWR_PT_ACTIVE) && likely(!d->arch.shadow_mode) )
+    {
+        pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
+        *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); 
+    }
+
+    /*
+     * STEP 4. Final tidy-up.
+     */
+
+    ptwr_info[cpu].ptinfo[which].l1va = 0;
+
+    if ( unlikely(sl1e != NULL) )
+    {
+        unmap_domain_mem(sl1e);
+        put_shadow_status(d);
+    }
+}
+
+/* Write page fault handler: check if guest is trying to modify a PTE. */
+int ptwr_do_page_fault(unsigned long addr)
+{
+    unsigned long    pte, pfn, l2e;
+    struct pfn_info *page;
+    l2_pgentry_t    *pl2e;
+    int              which, cpu = smp_processor_id();
+    u32              l2_idx;
+
+#ifdef __x86_64__
+    return 0; /* Writable pagetables need fixing for x86_64. */
+#endif
+
+    /*
+     * Attempt to read the PTE that maps the VA being accessed. By checking for
+     * PDE validity in the L2 we avoid many expensive fixups in __get_user().
+     */
+    if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
+           _PAGE_PRESENT) ||
+         __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
+    {
+        return 0;
+    }
+
+    pfn  = pte >> PAGE_SHIFT;
+    page = &frame_table[pfn];
+
+    /* We are looking only for read-only mappings of p.t. pages. */
+    if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
+         ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
+    {
+        return 0;
+    }
+    
+    /* Get the L2 index at which this L1 p.t. is always mapped. */
+    l2_idx = page->u.inuse.type_info & PGT_va_mask;
+    if ( unlikely(l2_idx >= PGT_va_unknown) )
+    {
+        domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
+    }
+    l2_idx >>= PGT_va_shift;
+
+    if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
+    {
+        MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
+        domain_crash();
+    }
+
+    /*
+     * Is the L1 p.t. mapped into the current address space? If so we call it
+     * an ACTIVE p.t., otherwise it is INACTIVE.
+     */
+    pl2e = &linear_l2_table[l2_idx];
+    l2e  = l2_pgentry_val(*pl2e);
+    which = PTWR_PT_INACTIVE;
+    if ( (l2e >> PAGE_SHIFT) == pfn )
+    {
+        /* Check the PRESENT bit to set ACTIVE. */
+        if ( likely(l2e & _PAGE_PRESENT) )
+            which = PTWR_PT_ACTIVE;
+        else {
+            /*
+             * If the PRESENT bit is clear, we may be conflicting with
+             * the current ACTIVE p.t. (it may be the same p.t. mapped
+             * at another virt addr).
+             * The ptwr_flush call below will restore the PRESENT bit.
+             */
+            if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
+                 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
+                which = PTWR_PT_ACTIVE;
+        }
+    }
+    
+    PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
+                "pfn %08lx\n", PTWR_PRINT_WHICH,
+                addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
+    
+    /*
+     * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
+     * time. If there is already one, we must flush it out.
+     */
+    if ( ptwr_info[cpu].ptinfo[which].l1va )
+        ptwr_flush(which);
+
+    ptwr_info[cpu].ptinfo[which].l1va   = addr | 1;
+    ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
+    
+    /* For safety, disconnect the L1 p.t. page from current space. */
+    if ( (which == PTWR_PT_ACTIVE) && 
+         likely(!current->domain->arch.shadow_mode) )
+    {
+        *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
+#if 1
+        flush_tlb(); /* XXX Multi-CPU guests? */
+#else
+        flush_tlb_all();
+#endif
+    }
+    
+    /* Temporarily map the L1 page, and make a copy of it. */
+    ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
+    memcpy(ptwr_info[cpu].ptinfo[which].page,
+           ptwr_info[cpu].ptinfo[which].pl1e,
+           ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
+    
+    /* Finally, make the p.t. page writable by the guest OS. */
+    pte |= _PAGE_RW;
+    PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
+                &linear_pg_table[addr>>PAGE_SHIFT], pte);
+    if ( unlikely(__put_user(pte, (unsigned long *)
+                             &linear_pg_table[addr>>PAGE_SHIFT])) )
+    {
+        MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
+                &linear_pg_table[addr>>PAGE_SHIFT]);
+        /* Toss the writable pagetable state and crash. */
+        unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
+        ptwr_info[cpu].ptinfo[which].l1va = 0;
+        domain_crash();
+    }
+    
+    return EXCRET_fault_fixed;
+}
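+
+/*
+ * Lifecycle recap: ptwr_do_page_fault() above disconnects the L1 page (in
+ * the ACTIVE case), snapshots its contents, and grants the guest a
+ * writable mapping. The next event that needs a consistent view (a further
+ * ptwr fault, or a hypercall via cleanup_writable_pagetable()) calls
+ * ptwr_flush(), which diffs the page against the snapshot, revalidates
+ * modified entries, and write-protects the page again.
+ */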
+
+static __init int ptwr_init(void)
+{
+    int i;
+
+    for ( i = 0; i < smp_num_cpus; i++ )
+    {
+        ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
+            (void *)alloc_xenheap_page();
+        ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
+            (void *)alloc_xenheap_page();
+    }
+
+    return 0;
+}
+__initcall(ptwr_init);
+
+
+
+
+/************************************************************************/
+/************************************************************************/
+/************************************************************************/
+
+#ifndef NDEBUG
+
+void ptwr_status(void)
+{
+    unsigned long pte, *ptep, pfn;
+    struct pfn_info *page;
+    int cpu = smp_processor_id();
+
+    ptep = (unsigned long *)&linear_pg_table
+        [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
+
+    if ( __get_user(pte, ptep) ) {
+        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
+        domain_crash();
+    }
+
+    pfn = pte >> PAGE_SHIFT;
+    page = &frame_table[pfn];
+    printk("need to alloc l1 page %p\n", page);
+    /* make pt page writable */
+    printk("need to make read-only l1-page at %p is %08lx\n",
+           ptep, pte);
+
+    if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
+        return;
+
+    if ( __get_user(pte, (unsigned long *)
+                    ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
+        MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
+                ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
+        domain_crash();
+    }
+    pfn = pte >> PAGE_SHIFT;
+    page = &frame_table[pfn];
+}
+
+void audit_domain(struct domain *d)
+{
+    int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
+
+    void adjust (struct pfn_info *page, int dir, int adjtype)
+    {
+        int count = page->count_info & PGC_count_mask;
+
+        if ( adjtype )
+        {
+            int tcount = page->u.inuse.type_info & PGT_count_mask;
+            
+            ttot++;
+
+            tcount += dir;
+
+            if ( tcount < 0 )
+            {
+                /* This will only come out once. */
+                printk("Audit %d: type count whent below zero pfn=%x "
+                       "taf=%x otaf=%x\n",
+                       d->id, page-frame_table,
+                       page->u.inuse.type_info,
+                       page->tlbflush_timestamp);
+            }
+            
+            page->u.inuse.type_info =
+                (page->u.inuse.type_info & ~PGT_count_mask) | 
+                (tcount & PGT_count_mask);
+        }
+
+        ctot++;
+        count += dir;
+        if ( count < 0 )
+        {
+            /* This will only come out once. */
+            printk("Audit %d: general count whent below zero pfn=%x "
+                   "taf=%x otaf=%x\n",
+                   d->id, page-frame_table,
+                   page->u.inuse.type_info,
+                   page->tlbflush_timestamp);
+        }
+            
+        page->count_info =
+            (page->count_info & ~PGC_count_mask) | 
+            (count & PGC_count_mask);            
+
+    }
+
+    void scan_for_pfn(struct domain *d, unsigned long xpfn)
+    {
+        unsigned long pfn, *pt;
+        struct list_head *list_ent;
+        struct pfn_info *page;
+        int i, j;
+
+        list_ent = d->page_list.next;
+        for ( i = 0; (list_ent != &d->page_list); i++ )
+        {
+            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+            page = &frame_table[pfn];
+            
+            switch ( page->u.inuse.type_info & PGT_type_mask )
+            {
+            case PGT_l1_page_table:
+            case PGT_l2_page_table:
+                pt = map_domain_mem(pfn<<PAGE_SHIFT);
+                for ( j = 0; j < ENTRIES_PER_L1_PAGETABLE; j++ )
+                    if ( (pt[j] & _PAGE_PRESENT) &&
+                         ((pt[j] >> PAGE_SHIFT) == xpfn) )
+                        printk("     found dom=%d i=%x pfn=%lx t=%x c=%x\n",
+                               d->id, j, pfn, page->u.inuse.type_info,
+                               page->count_info);
+                unmap_domain_mem(pt);           
+            }
+
+            list_ent = frame_table[pfn].list.next;
+        }
+
+    }
+
+    void scan_for_pfn_remote(unsigned long xpfn)
+    {
+        struct domain *e;
+        for_each_domain ( e )
+            scan_for_pfn( e, xpfn );            
+    }   
+
+    int i;
+    unsigned long pfn;
+    struct list_head *list_ent;
+    struct pfn_info *page;
+
+    if ( d != current->domain )
+        domain_pause(d);
+    synchronise_pagetables(~0UL);
+
+    printk("pt base=%lx sh_info=%x\n",
+           pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT,
+           virt_to_page(d->shared_info)-frame_table);
+           
+    spin_lock(&d->page_alloc_lock);
+
+    /* PHASE 0 */
+
+    list_ent = d->page_list.next;
+    for ( i = 0; (list_ent != &d->page_list); i++ )
+    {
+        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
+        page = &frame_table[pfn];
+
+        if ( page_get_owner(page) != d )
+            BUG();
+
+        if ( (page->u.inuse.type_info & PGT_count_mask) >
+             (page->count_info & PGC_count_mask) )
+            printk("taf > caf %x %x pfn=%lx\n",
+                   page->u.inuse.type_info, page->count_info, pfn );
+#if 0   /* SYSV shared memory pages plus writeable files. */
+        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && 
+             (page->u.inuse.type_info & PGT_count_mask) > 1 )
+        {
+            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
+                  pfn,
+                  page->u.inuse.type_info,
+                  page->count_info );
+            scan_for_pfn_remote(pfn);
+        }
+#endif
+        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && 
+             (page->u.inuse.type_info & PGT_count_mask) > 1 )
+        {
+            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
+                  pfn,
+                  page->u.inuse.type_info,
+                  page->count_info );
+        }
+
+        /* Use tlbflush_timestamp to store original type_info. */
+        page->tlbflush_timestamp = page->u.inuse.type_info;
+
+        list_ent = frame_table[pfn].list.next;
+    }
+
+
+    /* PHASE 1: Walk the page tables, draining the references that the
+       mappings we find account for. */
+
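+    /* Drain the reference held by the domain's installed base page table. */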
+    adjust(&frame_table[pagetable_val(
+        d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1);
+
+    list_ent = d->page_list.next;
+    for ( i = 0; (list_ent != &d->page_list); i++ )
+    {
+        unsigned long *pt;
+        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
+        page = &frame_table[pfn];
+
+        if ( page_get_owner(page) != d )
+            BUG();
+
+        switch ( page->u.inuse.type_info & PGT_type_mask )
+        {
+        case PGT_l2_page_table:
+
+            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
+                printk("Audit %d: L2 not validated %x\n",
+                       d->id, page->u.inuse.type_info);
+
+            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
+                printk("Audit %d: L2 not pinned %x\n",
+                       d->id, page->u.inuse.type_info);
+            else
+                adjust( page, -1, 1 );
+           
+            pt = map_domain_mem( pfn<<PAGE_SHIFT );
+
+            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+            {
+                if ( pt[i] & _PAGE_PRESENT )
+                {
+                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
+                    struct pfn_info *l1page = &frame_table[l1pfn];
+
+                    if ( page_get_owner(l1page) != d )
+                    {
+                        printk("L2: Skip bizarre page belonging to other "
+                               "dom %p\n", page_get_owner(l1page));
+                        continue;
+                    }
+                    
+                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
+                         PGT_l2_page_table )
+                        printk("Audit %d: [%x] Found %s Linear PT "
+                               "t=%x pfn=%lx\n", d->id, i, 
+                               (l1pfn==pfn) ? "Self" : "Other",
+                               l1page->u.inuse.type_info,
+                               l1pfn);
+                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
+                              PGT_l1_page_table )
+                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
+                               d->id, i,
+                               l1page->u.inuse.type_info,
+                               l1pfn);
+
+                    adjust(l1page, -1, 1);
+                }
+            }
+
+            unmap_domain_mem(pt);
+
+            break;
+
+
+        case PGT_l1_page_table:
+            
+            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
+                adjust( page, -1, 1 );
+
+            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
+                printk("Audit %d: L1 not validated %x\n",
+                       d->id, page->u.inuse.type_info);
+#if 0
+            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
+                printk("Audit %d: L1 not pinned %x\n",
+                       d->id, page->u.inuse.type_info);
+#endif
+            pt = map_domain_mem( pfn<<PAGE_SHIFT );
+
+            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+            {
+                if ( pt[i] & _PAGE_PRESENT )
+                {
+                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
+                    struct pfn_info *l1page = &frame_table[l1pfn];
+
+                    if ( l1pfn < 0x100 )
+                    {
+                        lowmem_mappings++;
+                        continue;
+                    }
+
+                    if ( l1pfn > max_page )
+                    {
+                        io_mappings++;
+                        continue;
+                    }
+
+                    if ( pt[i] & _PAGE_RW )
+                    {
+
+                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
+                             PGT_l1_page_table ||
+                             (l1page->u.inuse.type_info & PGT_type_mask) ==
+                             PGT_l2_page_table )
+                            printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
+                                   d->id, i,
+                                   l1page->u.inuse.type_info,
+                                   l1pfn);
+
+                    }
+
+                    if ( page_get_owner(l1page) != d )
+                    {
+                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
+                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
+                               d->id, pfn, i,
+                               page_get_owner(l1page),
+                               l1pfn,
+                               l1page->count_info,
+                               l1page->u.inuse.type_info,
+                               machine_to_phys_mapping[l1pfn]);    
+                        continue;
+                    }
+
+                    adjust(l1page, -1, 0);
+                }
+            }
+
+            unmap_domain_mem(pt);
+
+            break;
+        }       
+
+        list_ent = frame_table[pfn].list.next;
+    }
+
+    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
+        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
+               d->id, lowmem_mappings, io_mappings);
+
+    /* PHASE 2: Any counts still remaining are unaccounted for; report them. */
+
+    ctot = ttot = 0;
+    list_ent = d->page_list.next;
+    for ( i = 0; (list_ent != &d->page_list); i++ )
+    {
+        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+        page = &frame_table[pfn];
+
+        switch ( page->u.inuse.type_info & PGT_type_mask)
+        {
+        case PGT_l1_page_table:
+        case PGT_l2_page_table:
+            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
+            {
+                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
+                       d->id, page->u.inuse.type_info, 
+                       page->tlbflush_timestamp,
+                       page->count_info, pfn );
+                scan_for_pfn_remote(pfn);
+            }
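+            /* Fall through: page tables get the general-count check too. */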
+        default:
+            if ( (page->count_info & PGC_count_mask) != 1 )
+            {
+                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
+                       d->id, 
+                       page->count_info,
+                       page->u.inuse.type_info, 
+                       page->tlbflush_timestamp, pfn );
+                scan_for_pfn_remote(pfn);
+            }
+            break;
+        }
+
+        list_ent = frame_table[pfn].list.next;
+    }
+
+    /* PHASE 3: Reinstate the counts drained in phase 1 and clear the saved
+       type_info. */
+    list_ent = d->page_list.next;
+    for ( i = 0; (list_ent != &d->page_list); i++ )
+    {
+        unsigned long *pt;
+        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+        page = &frame_table[pfn];
+
+        switch ( page->u.inuse.type_info & PGT_type_mask )
+        {
+        case PGT_l2_page_table:
+            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
+                adjust( page, 1, 1 );          
+
+            pt = map_domain_mem( pfn<<PAGE_SHIFT );
+
+            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+            {
+                if ( pt[i] & _PAGE_PRESENT )
+                {
+                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
+                    struct pfn_info *l1page;
+
+                    if ( l1pfn > max_page )
+                        continue;
+
+                    l1page = &frame_table[l1pfn];
+
+                    if ( page_get_owner(l1page) == d )
+                        adjust(l1page, 1, 1);
+                }
+            }
+
+            unmap_domain_mem(pt);
+            break;
+
+        case PGT_l1_page_table:
+            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
+                adjust( page, 1, 1 );
+
+            pt = map_domain_mem( pfn<<PAGE_SHIFT );
+
+            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+            {
+                if ( pt[i] & _PAGE_PRESENT )
+                {
+                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
+                    struct pfn_info *l1page;
+
+                    if ( l1pfn > max_page )
+                        continue;
+
+                    l1page = &frame_table[l1pfn];
+
+                    if ( (page_get_owner(l1page) != d) ||
+                         (l1pfn < 0x100) || (l1pfn > max_page) )
+                        continue;
+
+                    adjust(l1page, 1, 0);
+                }
+            }
+
+            unmap_domain_mem(pt);
+            break;
+        }
+
+
+        page->tlbflush_timestamp = 0;
+
+        list_ent = frame_table[pfn].list.next;
+    }
+
+    spin_unlock(&d->page_alloc_lock);
+
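+    /* Reinstate the reference held by the installed base page table. */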
+    adjust(&frame_table[pagetable_val(
+        d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1);
+
+    printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
+
+    if ( d != current->domain )
+        domain_unpause(d);
+}
+
+void audit_domains(void)
+{
+    struct domain *d;
+    for_each_domain ( d )
+        audit_domain(d);
+}
+
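+/*
+ * Keyhandler wrapper: audit every domain from the debug console. The exact
+ * registration call depends on this tree's keyhandler interface; roughly
+ * (the helper name and key chosen here are assumptions):
+ *
+ *   add_key_handler('m', audit_domains_key, "audit domain ref counts");
+ */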
+void audit_domains_key(unsigned char key)
+{
+    audit_domains();
+}
+
+#endif
index 301d7a765e788232eaf76004bcbfa465d25af360..d867cd8f12007ee96ee619d9925936dc37ed8c04 100644 (file)
@@ -522,7 +522,7 @@ asmlinkage int do_general_protection(struct xen_regs *regs)
 
     /* Emulate some simple privileged instructions when exec'ed in ring 1. */
     if ( (regs->error_code == 0) &&
-         RING_1(regs) &&
+         GUESTOS_FAULT(regs) &&
          emulate_privileged_op(regs) )
         return 0;
 
index d5c272e301624e4a7d9e99bb1630f8f0230ebeea..f1a7ee0c8c9a9b700b58b1d91468444f52c5102a 100644 (file)
@@ -122,7 +122,7 @@ free_dom_mem(struct domain *d,
 long
 do_dom_mem_op(unsigned long  op, 
               unsigned long *extent_list, 
-              unsigned long  nr_extents,
+              unsigned int   nr_extents,
               unsigned int   extent_order,
               domid_t        domid)
 {
@@ -133,8 +133,7 @@ do_dom_mem_op(unsigned long  op,
     start_extent  = op >> START_EXTENT_SHIFT;
     op           &= (1 << START_EXTENT_SHIFT) - 1;
 
-    if ( unlikely(start_extent > nr_extents) || 
-         unlikely(nr_extents > ~0U) ) /* can pack into a uint? */
+    if ( unlikely(start_extent > nr_extents) )
         return -EINVAL;
 
     if ( likely(domid == DOMID_SELF) )
@@ -150,13 +149,11 @@ do_dom_mem_op(unsigned long  op,
     {
     case MEMOP_increase_reservation:
         rc = alloc_dom_mem(
-            d, extent_list, start_extent, 
-            (unsigned int)nr_extents, extent_order);
+            d, extent_list, start_extent, nr_extents, extent_order);
         break;
     case MEMOP_decrease_reservation:
         rc = free_dom_mem(
-            d, extent_list, start_extent, 
-            (unsigned int)nr_extents, extent_order);
+            d, extent_list, start_extent, nr_extents, extent_order);
         break;
     default:
         rc = -ENOSYS;
index 06346dc9d927acc4cbcf02ff3154da724bcf9cc9..7f3dad32fb9af1b8b4691d041a35a6627327e499 100644 (file)
@@ -99,6 +99,13 @@ typedef struct { unsigned long l4_lo; } l4_pgentry_t;
   (((_a) >> L4_PAGETABLE_SHIFT) & (ENTRIES_PER_L4_PAGETABLE - 1))
 #endif
 
+/* Given a virtual address, get an entry offset into a linear page table. */
+#if defined(__i386__)
+#define l1_linear_offset(_a) ((_a) >> PAGE_SHIFT)
+#elif defined(__x86_64__)
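+/* The mask strips the sign-extension bits of a canonical 48-bit address. */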
+#define l1_linear_offset(_a) (((_a) & ((1UL << 48) - 1)) >> PAGE_SHIFT)
+#endif
+
 #if defined(__i386__)
 #define pagetable_t l2_pgentry_t
 #define pagetable_val(_x)  ((_x).l2_lo)
index e28f17fba65e534de17a9a92b802ea6d1629779f..ca05876ac6c119db8898a1809dc9135ace70d204 100644 (file)
@@ -39,4 +39,6 @@ struct xen_regs
 #define RING_2(_r)    (((_r)->cs & 3) == 2)
 #define RING_3(_r)    (((_r)->cs & 3) == 3)
 
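+/* A guest-OS fault: x86/32 guest kernels execute in ring 1, outside VM86. */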
+#define GUESTOS_FAULT(_r) (!VM86_MODE(_r) && RING_1(_r))
+
 #endif
index 0169109fe3a1bb664907fb4bd7b998449986e603..d09bddccdb8cd3eecbe558a56ed8f479876be91a 100644 (file)
@@ -36,4 +36,6 @@ struct xen_regs
 #define RING_2(_r)    (((_r)->cs & 3) == 2)
 #define RING_3(_r)    (((_r)->cs & 3) == 3)
 
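+/* A guest-OS fault: x86/64 guest kernels execute in ring 3, outside VM86. */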
+#define GUESTOS_FAULT(_r) (!VM86_MODE(_r) && RING_3(_r))
+
 #endif
index da3d8f5c1f848004d97ddf16f4fce910b88b5e26..bb23ae81a4456cc5f76ee741b71f944485c7300f 100644 (file)
 #define VERIFY_READ 0
 #define VERIFY_WRITE 1
 
-#define __addr_ok(addr) ((unsigned long)(addr) < HYPERVISOR_VIRT_START)
-
 /*
- * Test whether a block of memory is a valid user space address.
- * Returns 0 if the range is valid, nonzero otherwise.
- *
- * This is equivalent to the following test:
- * ((u65)addr >= (u65)HYPERVISOR_VIRT_END) ?
- * (((u65)addr + (u65)size) >= ((u65)1 << 64)) :
- * (((u65)addr + (u65)size) >= ((u65)HYPERVISOR_VIRT_START))
+ * Valid if in +ve half of 48-bit address space, or above Xen-reserved area.
+ * This is also valid for range checks (addr, addr+size). As long as the
+ * start address is outside the Xen-reserved area, we will access a
+ * non-canonical address (and thus fault) before ever reaching
+ * HYPERVISOR_VIRT_START.
  */
-#define __range_not_ok(addr,size) ({ \
-    unsigned long flag,sum; \
-    if ((unsigned long)addr >= HYPERVISOR_VIRT_END) \
-        asm("addq %3,%1 ; sbbq %0,%0" \
-            :"=&r" (flag), "=r" (sum) \
-            :"1" (addr),"g" ((long)(size))); \
-    else \
-        asm("addq %3,%1 ; sbbq %0,%0 ; cmpq %1,%4 ; sbbq $0,%0"  \
-            :"=&r" (flag), "=r" (sum) \
-            :"1" (addr),"g" ((long)(size)),"r" (HYPERVISOR_VIRT_START)); \
-    flag; })
+#define __addr_ok(addr) \
+    (((unsigned long)(addr) < (1UL<<48)) || \
+     ((unsigned long)(addr) >= HYPERVISOR_VIRT_END))
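+/*
+ * For example, a copy starting just below the 2^47 canonical boundary walks
+ * into the non-canonical hole and faults there, well short of Xen space.
+ */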
 
-#define access_ok(type, addr, size) (__range_not_ok(addr,size) == 0)
+#define access_ok(type, addr, size) (__addr_ok(addr))
 
-#define array_access_ok(type,addr,count,size)                    \
-    (likely(sizeof(count) <= 4) /* disallow 64-bit counts */ &&  \
-     access_ok(type,addr,(unsigned long)count*(unsigned long)size))
+#define array_access_ok(type,addr,count,size) (__addr_ok(addr))
 
 extern long __get_user_bad(void);
 extern void __put_user_bad(void);